From c4545ba037f2e833d7cce04b5cded696c4296a3d Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 7 May 2026 09:12:33 -0700
Subject: [PATCH 001/129] ZTS: use 'zpool trim -w' in zpool_trim_partial.ksh

Don't use trim_progress(), which is racy, to wait for the pool trim
to complete.  Instead use the wait (-w) option, which is intended
for this.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18496
---
 .../cli_root/zpool_trim/zpool_trim_partial.ksh | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh
index e37f8e44c1a..a36649bc263 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh
@@ -73,7 +73,7 @@ log_must mkdir "$TESTDIR"
 log_must truncate -s $LARGESIZE "$LARGEFILE"
 log_must zpool create -O compression=off $TESTPOOL "$LARGEFILE"
 log_must mkfile $(( floor(LARGESIZE * 0.80) )) /$TESTPOOL/file
-sync_all_pools
+sync_pool $TESTPOOL
 
 new_size=$(du -k "$LARGEFILE" | awk '{print $1 * 1024}')
 log_must test $new_size -le $LARGESIZE
@@ -93,12 +93,8 @@ log_must test $new_size -gt $((4 * floor(LARGESIZE * 0.70) ))
 # Perform a partial trim, we expect it to skip most of the new metaslabs
 # which have never been used and therefore do not need be trimmed.
 log_must set_tunable64 TRIM_METASLAB_SKIP 1
-log_must zpool trim $TESTPOOL
-log_must set_tunable64 TRIM_METASLAB_SKIP 0
-
-while [[ "$(trim_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do
-	sleep 0.5
-done
+log_must zpool trim -w $TESTPOOL
+sync_pool $TESTPOOL true
 
 new_size=$(du -k "$LARGEFILE" | awk '{print $1 * 1024}')
 log_must test $new_size -gt $LARGESIZE
@@ -106,11 +102,9 @@ log_must test $new_size -gt $LARGESIZE
 # Perform a full trim, all metaslabs will be trimmed the pool vdev
 # size will be reduced but not down to its original size due to the
 # space usage of the new metaslabs.
-log_must zpool trim $TESTPOOL
-
-while [[ "$(trim_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do
-	sleep 0.5
-done
+log_must set_tunable64 TRIM_METASLAB_SKIP 0
+log_must zpool trim -w $TESTPOOL
+sync_pool $TESTPOOL true
 
 new_size=$(du -k "$LARGEFILE" | awk '{print $1 * 1024}')
 log_must test $new_size -le $(( 2 * LARGESIZE))

From 8fdc866757a96f0db77361554c8e1f33ecfc74a7 Mon Sep 17 00:00:00 2001
From: Gality <68463495+Gality369@users.noreply.github.com>
Date: Fri, 8 May 2026 00:14:20 +0800
Subject: [PATCH 002/129] zfs: annotate nested dd_lock in reservation sync
 accounting

When reservation sync updates a child's reserved space, it rolls the
delta into ancestor space accounting while still holding the child's
dd_lock.  That locking order is intentional, but Linux lockdep sees
the ancestor acquisition as recursive because it lacks a nested lock
subclass annotation.

Teach the reservation-sync space-accounting path to acquire ancestor
dd_lock instances with a nested subclass.  Keep the existing public
interfaces and accounting behavior unchanged by routing only the
ancestor rollup through local helpers.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: ZhengYuan Huang <gality369@gmail.com>
Signed-off-by: gality369 <gality369@example.com>
Closes #18497
---
 module/zfs/dsl_dir.c | 64 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 14 deletions(-)

diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index 2253b868b53..335b11dc2ff 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -1534,9 +1534,28 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 }
 
 /* call from syncing context when we actually write/free space for this dd */
-void
-dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
-    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+static void dsl_dir_diduse_transfer_space_impl(dsl_dir_t *dd, int64_t used,
+    int64_t compressed, int64_t uncompressed, int64_t tonew,
+    dd_used_t oldtype, dd_used_t newtype, boolean_t nested, dmu_tx_t *tx);
+
+static void
+dsl_dir_lock_enter(dsl_dir_t *dd, boolean_t nested)
+{
+	/*
+	 * lockdep needs an explicit subclass when a child dd_lock
+	 * nests an ancestor.
+	 */
+	if (nested) {
+		mutex_enter_nested(&dd->dd_lock, NESTED_SINGLE);
+	} else {
+		mutex_enter(&dd->dd_lock);
+	}
+}
+
+static void
+dsl_dir_diduse_space_impl(dsl_dir_t *dd, dd_used_t type,
+    int64_t used, int64_t compressed, int64_t uncompressed,
+    boolean_t nested, dmu_tx_t *tx)
 {
 	int64_t accounted_delta;
 
@@ -1554,7 +1573,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 	 */
 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 	if (needlock)
-		mutex_enter(&dd->dd_lock);
+		dsl_dir_lock_enter(dd, nested);
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
 	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
@@ -1582,12 +1601,20 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 		mutex_exit(&dd->dd_lock);
 
 	if (dd->dd_parent != NULL) {
-		dsl_dir_diduse_transfer_space(dd->dd_parent,
+		dsl_dir_diduse_transfer_space_impl(dd->dd_parent,
 		    accounted_delta, compressed, uncompressed,
-		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, nested, tx);
 	}
 }
 
+void
+dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used,
+    int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+	dsl_dir_diduse_space_impl(dd, type, used, compressed, uncompressed,
+	    B_FALSE, tx);
+}
+
 void
 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
@@ -1612,10 +1639,10 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
 	mutex_exit(&dd->dd_lock);
 }
 
-void
-dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
+static void
+dsl_dir_diduse_transfer_space_impl(dsl_dir_t *dd, int64_t used,
     int64_t compressed, int64_t uncompressed, int64_t tonew,
-    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
+	dd_used_t oldtype, dd_used_t newtype, boolean_t nested, dmu_tx_t *tx)
 {
 	int64_t accounted_delta;
 
@@ -1625,7 +1652,7 @@ dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
-	mutex_enter(&dd->dd_lock);
+	dsl_dir_lock_enter(dd, nested);
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
 	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
@@ -1656,12 +1683,21 @@ dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
 	mutex_exit(&dd->dd_lock);
 
 	if (dd->dd_parent != NULL) {
-		dsl_dir_diduse_transfer_space(dd->dd_parent,
+		dsl_dir_diduse_transfer_space_impl(dd->dd_parent,
 		    accounted_delta, compressed, uncompressed,
-		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, nested, tx);
 	}
 }
 
+void
+dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
+    int64_t compressed, int64_t uncompressed, int64_t tonew,
+    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
+{
+	dsl_dir_diduse_transfer_space_impl(dd, used, compressed,
+	    uncompressed, tonew, oldtype, newtype, B_FALSE, tx);
+}
+
 typedef struct dsl_dir_set_qr_arg {
 	const char *ddsqra_name;
 	zprop_source_t ddsqra_source;
@@ -1828,8 +1864,8 @@ dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
 
 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
-		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
-		    delta, 0, 0, tx);
+		dsl_dir_diduse_space_impl(dd->dd_parent, DD_USED_CHILD_RSRV,
+		    delta, 0, 0, B_TRUE, tx);
 	}
 	mutex_exit(&dd->dd_lock);
 }

From bdb8e8a2c5aba72e7a7b5887ab24f6a30711e73c Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 7 May 2026 09:15:16 -0700
Subject: [PATCH 003/129] ZTS: removal_with_export.ksh busy export

If the pool is active 'zpool export' will fail resulting in
a test failure.  Swap log_must with log_must_busy so the export
is retried when reported as busy before failing the test.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18498
---
 .../zfs-tests/tests/functional/removal/removal_with_export.ksh  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh
index b94841aed15..cdd2d201e1a 100755
--- a/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh
+++ b/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh
@@ -29,7 +29,7 @@ log_onexit default_cleanup_noexit
 function callback
 {
 	test_removal_with_operation_kill
-	log_must zpool export $TESTPOOL
+	log_must_busy zpool export $TESTPOOL
 
 	#
 	# We are concurrently starting dd processes that will

From d65015938e195370930c501d26f9c73ca6a4c4fe Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Thu, 7 May 2026 12:16:39 -0400
Subject: [PATCH 004/129] Vdev allocation bias/class change

Normal, special and dedup vdevs differ only by space allocation
bias.  Normal and special vdevs might even legally store blocks
targeted to other classes.  Dedup vdevs don't normally do it, but
there is no real reason why they can't.  Considering this, it is
not impossible to change the allocation bias for those vdevs.

This change introduces a new top-level vdev property -- alloc_bias,
which reports the current bias for the vdev and allows it to be
changed.  This makes it easy to change a vdev's role in a pool,
especially if vdev removal is impossible.  To not complicate the
code, changes take effect only on the next pool import.

Changes to/from log vdev could also be theoretically possible, but
they are artificially blocked for now, partially due to additional
complications, and partially due to the potential danger of placing
other blocks on log vdevs, whose failure would otherwise be non-fatal.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alek Pinchuk <alek.pinchuk@connectwise.com>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18493
---
 include/sys/fs/zfs.h                          |  11 ++
 include/sys/vdev_impl.h                       |   8 --
 lib/libzfs/libzfs.abi                         |   8 +-
 lib/libzfs/libzfs_pool.c                      |   3 +
 man/man7/vdevprops.7                          |  15 +++
 module/zcommon/zpool_prop.c                   |  12 ++
 module/zfs/vdev.c                             |  77 +++++++++++++
 module/zfs/zfs_ioctl.c                        |   6 +
 tests/runfiles/common.run                     |   3 +-
 tests/zfs-tests/tests/Makefile.am             |   2 +
 .../alloc_class/alloc_class_014_pos.ksh       | 109 ++++++++++++++++++
 .../alloc_class/alloc_class_015_neg.ksh       |  91 +++++++++++++++
 12 files changed, 334 insertions(+), 11 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh

diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index fcef32ecf9f..d9b6e7654b0 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -476,6 +476,7 @@ typedef enum {
 	VDEV_PROP_SCHEDULER,
 	VDEV_PROP_FDOMAIN,
 	VDEV_PROP_FGROUP,
+	VDEV_PROP_ALLOC_BIAS,
 	VDEV_NUM_PROPS
 } vdev_prop_t;
 
@@ -491,6 +492,16 @@ typedef enum {
 	VDEV_SCHEDULER_OFF
 } vdev_scheduler_type_t;
 
+/*
+ * Allocation bias for top-level vdevs (alloc_bias property).
+ */
+typedef enum vdev_alloc_bias {
+	VDEV_BIAS_NONE,
+	VDEV_BIAS_LOG,		/* dedicated to ZIL data (SLOG) */
+	VDEV_BIAS_SPECIAL,	/* dedicated to ddt, metadata, and small blks */
+	VDEV_BIAS_DEDUP		/* dedicated to dedup metadata */
+} vdev_alloc_bias_t;
+
 /*
  * Dataset property functions shared between libzfs and kernel.
  */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 634594aca12..3c19b9abe9c 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -155,14 +155,6 @@ struct vdev_queue {
 	kmutex_t	vq_lock;
 };
 
-typedef enum vdev_alloc_bias {
-	VDEV_BIAS_NONE,
-	VDEV_BIAS_LOG,		/* dedicated to ZIL data (SLOG) */
-	VDEV_BIAS_SPECIAL,	/* dedicated to ddt, metadata, and small blks */
-	VDEV_BIAS_DEDUP		/* dedicated to dedup metadata */
-} vdev_alloc_bias_t;
-
-
 /*
  * On-disk indirect vdev state.
  *
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index ad28c876630..be74babbcba 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -2553,7 +2553,7 @@
     <typedef-decl name='__uint32_t' type-id='f0981eeb' id='62f1140c'/>
     <typedef-decl name='__uint64_t' type-id='7359adad' id='8910171f'/>
     <typedef-decl name='size_t' type-id='7359adad' id='b59d7dce'/>
-    <class-decl name='libzfs_handle' size-in-bits='18432' is-struct='yes' visibility='default' id='c8a9d9d8'>
+    <class-decl name='libzfs_handle' size-in-bits='18496' is-struct='yes' visibility='default' id='c8a9d9d8'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='libzfs_error' type-id='95e97e5e' visibility='default'/>
       </data-member>
@@ -2605,6 +2605,9 @@
       <data-member access='public' layout-offset-in-bits='18112'>
         <var-decl name='zh_mnttab' type-id='f20fbd51' visibility='default'/>
       </data-member>
+      <data-member access='public' layout-offset-in-bits='18432'>
+        <var-decl name='zh_mnttab_cache_enabled' type-id='c19b74c3' visibility='default'/>
+      </data-member>
     </class-decl>
     <class-decl name='zfs_handle' size-in-bits='4928' is-struct='yes' visibility='default' id='f6ee4445'>
       <data-member access='public' layout-offset-in-bits='0'>
@@ -6412,7 +6415,8 @@
       <enumerator name='VDEV_PROP_SCHEDULER' value='55'/>
       <enumerator name='VDEV_PROP_FDOMAIN' value='56'/>
       <enumerator name='VDEV_PROP_FGROUP' value='57'/>
-      <enumerator name='VDEV_NUM_PROPS' value='58'/>
+      <enumerator name='VDEV_PROP_ALLOC_BIAS' value='58'/>
+      <enumerator name='VDEV_NUM_PROPS' value='59'/>
     </enum-decl>
     <typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
     <class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 0b015d8bce6..fd957d98313 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5741,6 +5741,9 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
 				return (ENOENT);
 			if (prop == VDEV_PROP_SIT_OUT)
 				return (ENOENT);
+			/* Only valid for top-level vdevs */
+			if (prop == VDEV_PROP_ALLOC_BIAS)
+				return (ENOENT);
 		}
 		if (vdev_prop_index_to_string(prop, intval,
 		    (const char **)&strval) != 0)
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index b45128dd924..5f5e10723c1 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -218,6 +218,21 @@ If this device should perform new allocations, used to disable a device
 when it is scheduled for later removal.
 See
 .Xr zpool-remove 8 .
+.It Sy alloc_bias Ns = Ns Sy none Ns | Ns Sy log Ns | Ns Sy special Ns | Ns Sy dedup
+Controls the allocation class for a top-level vdev.
+Changes take effect after an export and import of the pool.
+Changing to/from log is not implemented, since it may lead to data loss in
+case of the log device failure.
+Setting to
+.Sy special
+and
+.Sy dedup
+requires
+.Sy feature@allocation_classes
+to be enabled.
+At least one top-level vdev must remain in the normal
+.Pq Sy none
+class.
 .It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy on Ns | Ns Sy off
 Controls how I/O requests are added to the vdev queue when reading or
 writing to this vdev.
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index ee86fe0c717..13a1390d1e1 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -388,6 +388,14 @@ vdev_prop_init(void)
 		{ NULL }
 	};
 
+	static const zprop_index_t vdev_alloc_bias_table[] = {
+		{ "none",	VDEV_BIAS_NONE },
+		{ "log",	VDEV_BIAS_LOG },
+		{ "special",	VDEV_BIAS_SPECIAL },
+		{ "dedup",	VDEV_BIAS_DEDUP },
+		{ NULL }
+	};
+
 	struct zfs_mod_supported_features *sfeatures =
 	    zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);
 
@@ -556,6 +564,10 @@ vdev_prop_init(void)
 	    VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV,
 	    "auto | on | off", "IO_SCHEDULER",
 	    vdevschedulertype_table, sfeatures);
+	zprop_register_index(VDEV_PROP_ALLOC_BIAS, "alloc_bias",
+	    VDEV_BIAS_NONE, PROP_DEFAULT, ZFS_TYPE_VDEV,
+	    "none | log | special | dedup", "ALLOC_BIAS",
+	    vdev_alloc_bias_table, sfeatures);
 
 	/* hidden properties */
 	zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING,
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 4cc75ad5a25..9f083cd510f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -6093,6 +6093,29 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 				    strval);
 			}
 			break;
+		case VDEV_PROP_ALLOC_BIAS: {
+			intval = fnvpair_value_uint64(elem);
+			ASSERT3U(intval, !=, VDEV_BIAS_LOG);
+			const char *bias_str =
+			    (intval == VDEV_BIAS_SPECIAL) ?
+			    VDEV_ALLOC_BIAS_SPECIAL :
+			    (intval == VDEV_BIAS_DEDUP) ?
+			    VDEV_ALLOC_BIAS_DEDUP : NULL;
+			if (bias_str == NULL) {
+				(void) zap_remove(mos, objid,
+				    VDEV_TOP_ZAP_ALLOCATION_BIAS, tx);
+			} else {
+				VERIFY0(zap_update(mos, objid,
+				    VDEV_TOP_ZAP_ALLOCATION_BIAS,
+				    1, strlen(bias_str) + 1, bias_str, tx));
+				spa_activate_allocation_classes(spa, tx);
+			}
+			spa_history_log_internal(spa, "vdev set", tx,
+			    "vdev_guid=%llu: alloc_bias=%s",
+			    (u_longlong_t)vdev_guid,
+			    bias_str != NULL ? bias_str : "none");
+			break;
+		}
 		default:
 			/* normalize the property name */
 			propname = vdev_prop_to_name(prop);
@@ -6319,6 +6342,53 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 			}
 			vd->vdev_scheduler = intval;
 			break;
+		case VDEV_PROP_ALLOC_BIAS:
+			if (nvpair_value_uint64(elem, &intval) != 0) {
+				error = EINVAL;
+				break;
+			}
+			if (vd != vd->vdev_top || vd->vdev_top_zap == 0) {
+				error = ENOTSUP;
+				break;
+			}
+			/* Log vdevs are not supported: remove and re-add. */
+			if (vd->vdev_islog) {
+				error = ENOTSUP;
+				break;
+			}
+			/* special/dedup needs allocation_classes feature */
+			if (intval != VDEV_BIAS_NONE &&
+			    ((intval != VDEV_BIAS_SPECIAL &&
+			    intval != VDEV_BIAS_DEDUP) ||
+			    !spa_feature_is_enabled(spa,
+			    SPA_FEATURE_ALLOCATION_CLASSES))) {
+				error = ENOTSUP;
+				break;
+			}
+			/*
+			 * Disallow converting the last normal vdev to
+			 * avoid pool suspension on failed allocations.
+			 */
+			if (intval != VDEV_BIAS_NONE &&
+			    vd->vdev_alloc_bias == VDEV_BIAS_NONE) {
+				vdev_t *rvd = spa->spa_root_vdev;
+				int normal = 0;
+				for (uint64_t c = 0;
+				    c < rvd->vdev_children; c++) {
+					vdev_t *cvd = rvd->vdev_child[c];
+					if (vdev_is_concrete(cvd) &&
+					    cvd->vdev_alloc_bias ==
+					    VDEV_BIAS_NONE &&
+					    !cvd->vdev_noalloc)
+						normal++;
+				}
+				if (normal <= 1) {
+					error = ENOTSUP;
+					break;
+				}
+			}
+			vd->vdev_alloc_bias = (vdev_alloc_bias_t)intval;
+			break;
 		default:
 			/* Most processing is done in vdev_props_set_sync */
 			break;
@@ -6746,6 +6816,13 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    boolval, src);
 				break;
+			case VDEV_PROP_ALLOC_BIAS:
+				if (vd == vd->vdev_top) {
+					vdev_prop_add_list(outnvl, propname,
+					    NULL, vd->vdev_alloc_bias,
+					    ZPROP_SRC_NONE);
+				}
+				continue;
 			case VDEV_PROP_CHECKSUM_N:
 			case VDEV_PROP_CHECKSUM_T:
 			case VDEV_PROP_IO_N:
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index fe98e7db073..d31aa80641c 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3456,12 +3456,15 @@ zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 
 	ASSERT(spa_writeable(spa));
 
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) {
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_close(spa, FTAG);
 		return (SET_ERROR(ENOENT));
 	}
 
 	error = vdev_prop_set(vd, innvl, outnvl);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_close(spa, FTAG);
 
@@ -3500,12 +3503,15 @@ zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 	if ((error = spa_open(poolname, &spa, FTAG)) != 0)
 		return (error);
 
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) {
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_close(spa, FTAG);
 		return (SET_ERROR(ENOENT));
 	}
 
 	error = vdev_prop_get(vd, innvl, outnvl);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_close(spa, FTAG);
 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 4c7e4e85ec0..df80437ad0c 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -37,7 +37,8 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos',
     'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos',
     'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos',
     'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos',
-    'alloc_class_013_pos', 'alloc_class_016_pos']
+    'alloc_class_013_pos', 'alloc_class_014_pos', 'alloc_class_015_neg',
+    'alloc_class_016_pos']
 tags = ['functional', 'alloc_class']
 
 [tests/functional/append]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index c4bcfea5595..a6242ba0f52 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -434,6 +434,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/alloc_class/alloc_class_011_neg.ksh \
 	functional/alloc_class/alloc_class_012_pos.ksh \
 	functional/alloc_class/alloc_class_013_pos.ksh \
+	functional/alloc_class/alloc_class_014_pos.ksh \
+	functional/alloc_class/alloc_class_015_neg.ksh \
 	functional/alloc_class/alloc_class_016_pos.ksh \
 	functional/alloc_class/cleanup.ksh \
 	functional/alloc_class/setup.ksh \
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh
new file mode 100755
index 00000000000..27c55bc5906
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh
@@ -0,0 +1,109 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2026, TrueNAS.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+#	The alloc_bias vdev property is readable and settable on top-level vdevs.
+#
+# STRATEGY:
+#	1. Create a pool with one normal mirror and one special mirror.
+#	2. Verify alloc_bias getter returns "none" for normal and "special"
+#	   for the special mirror.
+#	3. Verify alloc_bias is not reported for leaf (child) vdevs.
+#	4. Set alloc_bias=none on the special vdev; verify getter returns "none".
+#	5. Export and import the pool; verify no "special" section in status.
+#	6. Set alloc_bias=dedup on the same vdev; verify getter returns "dedup".
+#	7. Export and import the pool; verify "dedup" section appears in status.
+#	8. Set alloc_bias=special; verify getter returns "special".
+#	9. Export and import; verify "special" section appears again.
+#
+
+verify_runnable "global"
+
+claim="alloc_bias vdev property is readable and settable on top-level vdevs"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+# One normal mirror (always stays normal) and one special mirror.
+# The normal mirror ensures the pool always has normal-class vdevs
+# regardless of what we do to the second mirror.
+log_must zpool create $TESTPOOL \
+    mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
+    special mirror $CLASS_DISK0 $CLASS_DISK1
+
+# Find the special vdev name (mirror-N) from zpool status.
+TVDEV=$(zpool status $TESTPOOL | \
+    awk '/special/{found=1} found && /mirror-/{print $1; exit}')
+log_note "Special vdev: $TVDEV"
+[[ -n "$TVDEV" ]] || log_fail "Could not determine special vdev name"
+
+# Verify initial alloc_bias values.
+BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL mirror-0)
+[[ "$BIAS" == "none" ]] || \
+    log_fail "Normal mirror alloc_bias: expected none, got $BIAS"
+
+BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV)
+[[ "$BIAS" == "special" ]] || \
+    log_fail "Special mirror alloc_bias: expected special, got $BIAS"
+
+# Verify alloc_bias is not reported for a leaf vdev.
+LEAF_OUT=$(zpool get -H -o name,value alloc_bias $TESTPOOL \
+    $ZPOOL_DISK0 2>&1)
+[[ -z "$LEAF_OUT" ]] || \
+    log_fail "alloc_bias reported for leaf vdev, got: $LEAF_OUT"
+
+# --- special -> none, verify after export/import ---
+log_must zpool set alloc_bias=none $TESTPOOL $TVDEV
+BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV)
+[[ "$BIAS" == "none" ]] || \
+    log_fail "After set none: alloc_bias expected none, got $BIAS"
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL
+zpool status $TESTPOOL | grep -q "special" && \
+    log_fail "special still shown after alloc_bias=none + reimport"
+
+# --- none -> dedup, verify after export/import ---
+log_must zpool set alloc_bias=dedup $TESTPOOL $TVDEV
+BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV)
+[[ "$BIAS" == "dedup" ]] || \
+    log_fail "After set dedup alloc_bias expected dedup, got $BIAS"
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL
+zpool status $TESTPOOL | grep -q "dedup" || \
+    log_fail "dedup not shown after alloc_bias=dedup + reimport"
+
+# --- dedup -> special, verify after export/import ---
+log_must zpool set alloc_bias=special $TESTPOOL $TVDEV
+BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV)
+[[ "$BIAS" == "special" ]] || \
+    log_fail "After set special alloc_bias expected special, got $BIAS"
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL
+zpool status $TESTPOOL | grep -q "special" || \
+    log_fail "special not shown after alloc_bias=special + reimport"
+
+log_must zpool destroy -f $TESTPOOL
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh
new file mode 100755
index 00000000000..43740690b3c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh
@@ -0,0 +1,91 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2026, TrueNAS.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+#	Setting the alloc_bias vdev property to invalid values or on
+#	unsupported vdev types fails.
+#
+# STRATEGY:
+#	1. Create a pool with a normal mirror and a log vdev.
+#	2. Verify setting alloc_bias on a leaf vdev fails.
+#	3. Verify setting alloc_bias=log fails.
+#	4. Verify setting alloc_bias to an unknown value fails.
+#	5. Verify setting alloc_bias on a log vdev fails.
+#	6. Verify setting alloc_bias=special fails when allocation_classes
+#	   feature is not enabled.
+#	7. Verify converting the last normal vdev fails.
+#
+
+verify_runnable "global"
+
+claim="Setting alloc_bias to invalid values or on unsupported vdevs fails"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+# Create a pool with a normal mirror and a log vdev.
+log_must zpool create $TESTPOOL \
+    mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
+    log $CLASS_DISK0
+
+NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}')
+log_note "Normal vdev: $NORMAL_VDEV"
+
+# Setting alloc_bias on a leaf vdev must fail.
+log_mustnot zpool set alloc_bias=special $TESTPOOL $ZPOOL_DISK0
+
+# Setting alloc_bias=log must fail (log vdevs must be removed and re-added).
+log_mustnot zpool set alloc_bias=log $TESTPOOL $NORMAL_VDEV
+
+# Setting alloc_bias to an unknown value must fail.
+log_mustnot zpool set alloc_bias=bogus $TESTPOOL $NORMAL_VDEV
+
+# Setting alloc_bias on a log vdev must fail.
+# CLASS_DISK0 is a single-disk (non-mirror) top-level log vdev.
+log_mustnot zpool set alloc_bias=special $TESTPOOL $CLASS_DISK0
+
+log_must zpool destroy -f $TESTPOOL
+
+# Verify setting alloc_bias=special fails when allocation_classes is disabled.
+# Create a pool with the allocation_classes feature explicitly disabled.
+log_must zpool create -o feature@allocation_classes=disabled $TESTPOOL \
+    mirror $ZPOOL_DISK0 $ZPOOL_DISK1
+
+NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}')
+log_mustnot zpool set alloc_bias=special $TESTPOOL $NORMAL_VDEV
+log_mustnot zpool set alloc_bias=dedup $TESTPOOL $NORMAL_VDEV
+
+log_must zpool destroy -f $TESTPOOL
+
+# Verify that converting the last normal-class top-level vdev fails.
+# A pool must always retain at least one normal vdev.
+log_must zpool create $TESTPOOL \
+    mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
+    special mirror $CLASS_DISK0 $CLASS_DISK1
+
+NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}')
+log_mustnot zpool set alloc_bias=special $TESTPOOL $NORMAL_VDEV
+log_mustnot zpool set alloc_bias=dedup $TESTPOOL $NORMAL_VDEV
+
+log_must zpool destroy -f $TESTPOOL
+log_pass $claim

From 500b44eef286870e5cb8447c75ce007ef992f840 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 7 May 2026 09:54:45 -0700
Subject: [PATCH 005/129] ZTS: zpool_iostat_002_pos remove sleep

In the CI environment commands may occasionally take longer than
expected.  For zpool_iostat_002_pos this can cause a failure if fewer
than the expected number of lines are logged in time.  To prevent
this issue relax the time constraint and simply verify the command
ran to completion and generated the correct number of lines.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18501
---
 .../cli_user/zpool_iostat/zpool_iostat_002_pos.ksh        | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
index a2b3464b2bf..b1c12f1306a 100755
--- a/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh
@@ -38,9 +38,8 @@
 #
 # STRATEGY:
 # 1. Set the interval to 1 and count to 4.
-# 2. Sleep for 5 seconds.
-# 3. Verify that the output has 4 records.
-# 4. Set interval to 0.5 and count to 1 to test floating point intervals.
+# 2. Verify that the output has 4 records.
+# 3. Set interval to 0.5 and count to 1 to test floating point intervals.
 
 verify_runnable "both"
 
@@ -61,8 +60,7 @@ if ! is_global_zone ; then
 	TESTPOOL=${TESTPOOL%%/*}
 fi
 
-log_must eval "zpool iostat $TESTPOOL 1 4 > $tmpfile 2>&1 &"
-log_must sleep 5
+log_must eval "zpool iostat $TESTPOOL 1 4 > $tmpfile 2>&1"
 stat_count=$(grep -c $TESTPOOL $tmpfile)
 
 if [[ $stat_count -ne 4 ]]; then

From 439b802e77166e3a58cfd364105a711798ca0743 Mon Sep 17 00:00:00 2001
From: Gality <68463495+Gality369@users.noreply.github.com>
Date: Fri, 8 May 2026 04:20:44 +0800
Subject: [PATCH 006/129] sa: fix sa_add_projid lock ordering

sa_add_projid() currently acquires hdl->sa_lock before zp->z_lock.
Several same-znode update paths take zp->z_lock and then call
sa_update() or sa_bulk_update() on the same SA handle.

On Linux, FS_IOC_FSSETXATTR reaches zfs_setattr() through
zpl_ioctl_setxattr() without outer inode serialization. This makes
the reversed lock order a real ABBA deadlock rather than a lockdep
false positive when projid is added to an old-format inode while
another thread updates the same znode.

Acquire zp->z_lock before hdl->sa_lock in sa_add_projid() to match
the existing znode update ordering.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: ZhengYuan Huang <gality369@gmail.com>
Co-authored-by: gality369 <gality369@example.com>
Closes #18503
---
 module/zfs/sa.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index bd565bb7101..c6b36474b9f 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -1605,8 +1605,8 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
 
 	bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
 	attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
-	mutex_enter(&hdl->sa_lock);
 	mutex_enter(&zp->z_lock);
+	mutex_enter(&hdl->sa_lock);
 
 	err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid,
 	    sizeof (uint64_t));
@@ -1750,8 +1750,8 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
 	zp->z_is_sa = B_TRUE;
 
 out:
-	mutex_exit(&zp->z_lock);
 	mutex_exit(&hdl->sa_lock);
+	mutex_exit(&zp->z_lock);
 	kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
 	kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END);
 	if (dxattr_obj)

From a2d053329c2aa4327a5d80fb9aeaa8455e5527e0 Mon Sep 17 00:00:00 2001
From: Sean Eric Fagan <kithrup@users.noreply.github.com>
Date: Thu, 7 May 2026 21:22:38 +0100
Subject: [PATCH 007/129] Add some more file layout output, triggered by -v

With one -v, the block type (parity or data) is printed (matching
the ASCII-art version); with two -v, the offset into the file is
also printed.

This also updates the man page, and adds some simple
test scripts.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Sean Fagan <sean.fagan@klarasystems.com>
Signed-off-by: Sean Fagan <sean.fagan@klarasystems.com>
Closes #18470
---
 cmd/zdb/zdb.c                                 | 64 +++++++++++----
 man/man8/zdb.8                                |  7 +-
 tests/runfiles/common.run                     |  7 +-
 tests/zfs-tests/tests/Makefile.am             |  4 +
 .../cli_root/zdb/zdb_file_layout_001.ksh      | 78 +++++++++++++++++++
 .../cli_root/zdb/zdb_file_layout_002.ksh      | 78 +++++++++++++++++++
 .../cli_root/zdb/zdb_file_layout_003.ksh      | 78 +++++++++++++++++++
 .../cli_root/zdb/zdb_file_layout_neg.ksh      | 57 ++++++++++++++
 8 files changed, 353 insertions(+), 20 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_001.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_002.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_003.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_neg.ksh

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 68c9696a8aa..a2e65de6662 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2802,18 +2802,18 @@ print_file_layout_raidz(vdev_t *vd, blkptr_t *bp, uint64_t file_offset,
 	    vd->vdev_children, vdrz->vd_nparity);
 	raidz_row_t *rr = rm->rm_row[0];
 
-	/*
-	 * Account for out of order disks in raidz1.
-	 * For now just reverse them back and adjust for it later.
-	 */
-	if (rr->rr_firstdatacol == 1 && (zio.io_offset & (1ULL << 20))) {
-		uint64_t devidx = rr->rr_col[0].rc_devidx;
-		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
-		rr->rr_col[1].rc_devidx = devidx;
-	}
-
 	if (!dump_opt['H']) {
 		int last_disk = vd->vdev_children - 1;
+		/*
+		 * Account for out of order disks in raidz1.
+		 * For now just reverse them back and adjust for it later.
+		 */
+		if (rr->rr_firstdatacol == 1 &&
+		    (zio.io_offset & (1ULL << 20))) {
+			uint64_t devidx = rr->rr_col[0].rc_devidx;
+			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+			rr->rr_col[1].rc_devidx = devidx;
+		}
 		int first_disk = rr->rr_col[0].rc_devidx;
 
 		(void) printf("%12llx", (u_longlong_t)file_offset);
@@ -2843,23 +2843,49 @@ print_file_layout_raidz(vdev_t *vd, blkptr_t *bp, uint64_t file_offset,
 		static uint64_t next_offset = 0;
 
 		if (next_offset != file_offset) {
-			(void) printf("skip hole\t-\t%llx\n",
-			    (u_longlong_t)((file_offset - next_offset) >>
-			    vd->vdev_ashift));
+			(void) printf("skip hole\t-\t\t%llu\n",
+			    (u_longlong_t)((file_offset - next_offset) / 512));
 		}
 		next_offset = file_offset + BP_GET_LSIZE(bp);
+		uint64_t tmp_offset = file_offset;
+
 
 		for (int c = 0; c < rr->rr_cols; c++) {
+			boolean_t pcol = c < rr->rr_firstdatacol;
 			raidz_col_t *rc = &rr->rr_col[c];
 			char *path = vd->vdev_child[rc->rc_devidx]->vdev_path;
-			// c < rr->rr_firstdatacol
+
 			if (rc->rc_size == 0)
 				continue;
-			(void) printf("%s\t%llu\t%d\n",
+			(void) printf("%s\t\t%llu\t%d",
 			    zfs_basename(path),
 			    (u_longlong_t)(rc->rc_offset +
 			    VDEV_LABEL_START_SIZE)/512,
 			    (int)rc->rc_size/512);
+			if (dump_opt['v']) {
+				char label = pcol ? 'P' : 'D';
+				int num;
+
+				if (c < 2) {
+					num = 0;
+				} else {
+					num = pcol ? c :
+					    (c - rr->rr_firstdatacol);
+				}
+				printf("\t%c%d", label, num);
+				if (dump_opt['v'] > 1) {
+					unsigned long long off;
+					if (pcol)
+						off = file_offset;
+					else
+						off = tmp_offset;
+					off = off / 512ULL;
+					printf("\t%llu", off);
+				}
+			}
+			if (!pcol)
+				tmp_offset += rc->rc_size;
+			printf("\n");
 		}
 	}
 }
@@ -2989,7 +3015,12 @@ dump_indirect_layout(dnode_t *dn)
 	 * Start layout with a header
 	 */
 	if (dump_opt['H']) {
-		(void) printf("DISK\t\tLBA\t\tCOUNT\n");
+		(void) printf("DISK\t\t\tLBA\tCOUNT");
+		if (dump_opt['v'])
+			(void) printf("\tTYPE");
+		if (dump_opt['v'] > 1)
+			(void) printf("\tOFFSET");
+		printf("\n");
 	} else {
 		char diskhdr[16];
 
@@ -10519,6 +10550,7 @@ main(int argc, char **argv)
 		}
 
 		if (dump_opt['f'] && os != NULL) {
+			dump_opt['v'] = verbose;
 			dump_file_data_layout(os);
 		} else if (dump_opt['B']) {
 			dump_backup(target, objset_id,
diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index f500e7e8a13..596e1d94e39 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -284,10 +284,15 @@ Decode and display block from an embedded block pointer specified by the
 arguments.
 .It Fl f , -file-layout
 Display the file layout of an object for the disks of a raidz vdev.
+Numeric values in the display are hexadecimal.
 With
 .Fl H ,
 the output is in scripted mode for easy parsing, with all values
-being presented as 512 byte blocks.
+being presented as 512 byte blocks in decimal; with
+.Fl v ,
+the block type (parity or data) is displayed; with
+.Fl vv ,
+the offset into the file for each block is also printed.
 Only a single top-level raidz vdev is supported.
 .It Fl h , -history
 Display pool history similar to
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index df80437ad0c..14e4bd79f85 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -173,9 +173,10 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
     'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
     'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
     'zdb_display_block', 'zdb_encrypted', 'zdb_encrypted_raw',
-    'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos',
-    'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2',
-    'zdb_backup', 'zdb_tunables']
+    'zdb_file_layout_001', 'zdb_file_layout_002', 'zdb_file_layout_003',
+    'zdb_file_layout_neg', 'zdb_label_checksum', 'zdb_object_range_neg',
+    'zdb_object_range_pos', 'zdb_objset_id', 'zdb_decompress_zstd',
+    'zdb_recover', 'zdb_recover_2', 'zdb_backup', 'zdb_tunables']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zdb']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index a6242ba0f52..28acc6f3af1 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -650,6 +650,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zdb/zdb_encrypted.ksh \
 	functional/cli_root/zdb/zdb_encrypted_raw.ksh \
 	functional/cli_root/zdb/zdb_label_checksum.ksh \
+	functional/cli_root/zdb/zdb_file_layout_001.ksh \
+	functional/cli_root/zdb/zdb_file_layout_002.ksh \
+	functional/cli_root/zdb/zdb_file_layout_003.ksh \
+	functional/cli_root/zdb/zdb_file_layout_neg.ksh \
 	functional/cli_root/zdb/zdb_object_range_neg.ksh \
 	functional/cli_root/zdb/zdb_object_range_pos.ksh \
 	functional/cli_root/zdb/zdb_objset_id.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_001.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_001.ksh
new file mode 100755
index 00000000000..f9c9555b84b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_001.ksh
@@ -0,0 +1,78 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Datto, Inc. All rights reserved.
+# Copyright (c) 2026, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+# zdb -fHv <dataset> <objnum> will display block
+# layouts for the object.
+#
+# Strategery:
+# 1. Create a RAIDZ1 pool, set compression to none
+# 2. Create a file filled with random data
+# 3. Get the inode number of the file
+# 4. Run zdb -fHv <pool>/ <inum> & extract file
+# 5. Compare real file and extracted file.
+
+DATA=/$TESTPOOL1/random.bin
+BLOCKS=$(( $RANDOM % 16 + 1 ))	# 1..16 records, so the file is never empty
+COMPARE=/tmp/compare.$$
+
+function cleanup
+{
+    destroy_pool $TESTPOOL1
+    rm -f $TESTDIR/file?.bin $COMPARE
+}
+
+log_assert "Verify zdb -fHv displays correct offsets"
+log_onexit cleanup
+
+# 1. Create a RAIDZ1 pool
+log_must mkdir -p $TESTDIR
+for file in 1 2 3 4 5
+do
+    rm -f $TESTDIR/file${file}.bin
+    touch $TESTDIR/file${file}.bin
+    log_must truncate -s 128m $TESTDIR/file${file}.bin
+done
+
+log_must zpool create -O compression=off -O recordsize=16K $TESTPOOL1 raidz1 $TESTDIR/file[12345].bin
+zfs get compression,recordsize $TESTPOOL1
+# 2. Create a file with random data
+log_must rm -f $DATA
+log_must dd if=/dev/urandom of=${DATA} bs=16k count=${BLOCKS} > /dev/null 2>&1
+log_must zpool sync $TESTPOOL1
+
+# 3. Get the inode number of the file
+INUM=$(ls -li $DATA | cut -f1 -d ' ')
+
+# 4. Rebuild the file from the data (D) columns that zdb reports, using dd
+rm -f $COMPARE
+log_must touch ${COMPARE}
+log_must zdb -fHv $TESTPOOL1/ ${INUM} | grep 'D.$' |
+    while read file offset count rest
+    do
+	log_must sh -c "dd if=$TESTDIR/${file} bs=512 skip=${offset} count=${count} >> ${COMPARE}"
+    done
+
+# 5. Compare files
+log_must cmp ${COMPARE} ${DATA}
+
+log_pass "'zdb -fHv' works as expected."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_002.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_002.ksh
new file mode 100755
index 00000000000..455ec6ccb21
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_002.ksh
@@ -0,0 +1,78 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Datto, Inc. All rights reserved.
+# Copyright (c) 2026, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+# zdb -fHv <dataset> <objnum> will display block
+# layouts for the object.
+#
+# Strategery:
+# 1. Create a RAIDZ2 pool, set compression to none
+# 2. Create a file filled with random data
+# 3. Get the inode number of the file
+# 4. Run zdb -fHv <pool>/ <inum> & extract file
+# 5. Compare real file and extracted file.
+
+DATA=/$TESTPOOL1/random.bin
+BLOCKS=$(( $RANDOM % 16 + 1 ))	# 1..16 records, so the file is never empty
+COMPARE=/tmp/compare.$$
+
+function cleanup
+{
+    destroy_pool $TESTPOOL1
+    rm -f $TESTDIR/file?.bin $COMPARE
+}
+
+log_assert "Verify zdb -fHv displays correct offsets"
+log_onexit cleanup
+
+# 1. Create a RAIDZ2 pool
+log_must mkdir -p $TESTDIR
+for file in 1 2 3 4 5 6
+do
+    rm -f $TESTDIR/file${file}.bin
+    touch $TESTDIR/file${file}.bin
+    log_must truncate -s 128m $TESTDIR/file${file}.bin
+done
+
+log_must zpool create -O compression=off -O recordsize=16K $TESTPOOL1 raidz2 $TESTDIR/file[123456].bin
+zfs get compression,recordsize $TESTPOOL1
+# 2. Create a file with random data
+log_must rm -f $DATA
+log_must dd if=/dev/urandom of=${DATA} bs=16k count=${BLOCKS} > /dev/null 2>&1
+log_must zpool sync $TESTPOOL1
+
+# 3. Get the inode number of the file
+INUM=$(ls -li $DATA | cut -f1 -d ' ')
+
+# 4. Rebuild the file from the data (D) columns that zdb reports, using dd
+rm -f $COMPARE
+log_must touch ${COMPARE}
+log_must zdb -fHv $TESTPOOL1/ ${INUM} | grep 'D.$' |
+    while read file offset count rest
+    do
+	log_must sh -c "dd if=$TESTDIR/${file} bs=512 skip=${offset} count=${count} >> ${COMPARE}"
+    done
+
+# 5. Compare files
+log_must cmp ${COMPARE} ${DATA}
+
+log_pass "'zdb -fHv' works as expected."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_003.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_003.ksh
new file mode 100755
index 00000000000..7673b3488c7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_003.ksh
@@ -0,0 +1,78 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Datto, Inc. All rights reserved.
+# Copyright (c) 2026, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+# zdb -fHv <dataset> <objnum> will display block
+# layouts for the object.
+#
+# Strategery:
+# 1. Create a RAIDZ3 pool, set compression to none
+# 2. Create a file filled with random data
+# 3. Get the inode number of the file
+# 4. Run zdb -fHv <pool>/ <inum> & extract file
+# 5. Compare real file and extracted file.
+
+DATA=/$TESTPOOL1/random.bin
+BLOCKS=$(( $RANDOM % 16 + 1 ))	# 1..16 records, so the file is never empty
+COMPARE=/tmp/compare.$$
+
+function cleanup
+{
+    destroy_pool $TESTPOOL1
+    rm -f $TESTDIR/file?.bin $COMPARE
+}
+
+log_assert "Verify zdb -fHv displays correct offsets"
+log_onexit cleanup
+
+# 1. Create a RAIDZ3 pool
+log_must mkdir -p $TESTDIR
+for file in 1 2 3 4 5 6 7
+do
+    rm -f $TESTDIR/file${file}.bin
+    touch $TESTDIR/file${file}.bin
+    log_must truncate -s 128m $TESTDIR/file${file}.bin
+done
+
+log_must zpool create -O compression=off -O recordsize=16K $TESTPOOL1 raidz3 $TESTDIR/file[1234567].bin
+zfs get compression,recordsize $TESTPOOL1
+# 2. Create a file with random data
+log_must rm -f $DATA
+log_must dd if=/dev/urandom of=${DATA} bs=16k count=${BLOCKS} > /dev/null 2>&1
+log_must zpool sync $TESTPOOL1
+
+# 3. Get the inode number of the file
+INUM=$(ls -li $DATA | cut -f1 -d ' ')
+
+# 4. Rebuild the file from the data (D) columns that zdb reports, using dd
+rm -f $COMPARE
+log_must touch ${COMPARE}
+log_must zdb -fHv $TESTPOOL1/ ${INUM} | grep 'D.$' |
+    while read file offset count rest
+    do
+	log_must sh -c "dd if=$TESTDIR/${file} bs=512 skip=${offset} count=${count} >> ${COMPARE}"
+    done
+
+# 5. Compare files
+log_must cmp ${COMPARE} ${DATA}
+
+log_pass "'zdb -fHv' works as expected."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_neg.ksh
new file mode 100755
index 00000000000..124bdb6b6b3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_neg.ksh
@@ -0,0 +1,57 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Datto, Inc. All rights reserved.
+# Copyright (c) 2026, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+# Ensure zdb -f only works on raidz
+#
+# Strategery:
+# 1. Create a pool with one disk
+# 2. Create a file
+# 3. Get the inode number of the file
+# 4. Run zdb -f
+# 5. Confirm failure status
+
+function cleanup
+{
+    destroy_pool $TESTPOOL1
+    rm -f $TESTDIR/file1.bin
+}
+
+log_assert "Verify zdb -f fails on non-raidz pool"
+log_onexit cleanup
+
+# 1. Create a single-disk (non-raidz) pool
+log_must mkdir -p $TESTDIR
+log_must truncate -s 128m $TESTDIR/file1.bin
+log_must zpool create -f $TESTPOOL1 $TESTDIR/file1.bin
+
+# 2. Create a file and sync it out so zdb can find the object on disk
+log_must touch /$TESTPOOL1/file.txt
+log_must zpool sync $TESTPOOL1
+
+# 3. Get the inode number of the file
+INUM=$(ls -li /$TESTPOOL1/file.txt | cut -f1 -d ' ')
+
+# 4. Run zdb -f against the dataset; 5. it must fail on a non-raidz vdev
+log_mustnot zdb -f $TESTPOOL1/ $INUM
+
+log_pass "'zdb -f' fails on non-raidz as expected."

From 6a25950e72962f9e8def71504c4a43a13baa87d0 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 7 May 2026 15:57:07 -0700
Subject: [PATCH 008/129] ZTS: redundancy_draid_spare1

Preserve the 'zpool status' output used to calculate the number of
checksum errors so it can be logged on failure.  Several instances have
been observed in the CI where cksum was set to a non-zero value, yet a
subsequent run of 'zpool status' on failure showed no checksum errors.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18500
---
 .../functional/redundancy/redundancy.kshlib   | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib
index 2b5a28b0620..ae8a4b2a648 100644
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib
@@ -435,32 +435,38 @@ function verify_draid_pool
 
 	log_note "verify_draid_pool $pool $replace_mode"
 	log_must zpool scrub -w $pool
+	sync_pool $pool true
 
-	typeset -i cksum=$(cksum_pool $pool)
+	typeset status=$(zpool status -p $pool)
+	typeset -i cksum=$(echo "$status" | awk '
+	    !NF { isvdev = 0 }
+	    isvdev { errors += $NF }
+	    /CKSUM$/ { isvdev = 1 }
+	    END { print errors }')
 
 	if [[ "$replace_mode" = "healing" ]]; then
 		if [[ $cksum -gt 0 ]]; then
-			log_must zpool status -v $pool
+			log_note "$status"
 			log_fail "Unexpected CKSUM errors found for $pool ($cksum)"
 		fi
 
 		if ! check_pool_status $pool "scan" "repaired 0B"; then
-			log_must zpool status -v $pool
+			log_note "$status"
 			log_fail "Unexpected repair IO found for $pool ($cksum)"
 		fi
 	elif [[ "$replace_mode" = "sequential" ]]; then
 		if [[ $cksum -gt 0 ]]; then
-			log_must zpool status -v $pool
+			log_note "$status"
 			log_fail "Unexpected CKSUM errors found for $pool ($cksum)"
 		fi
 	elif [[ "$replace_mode" = "damaged" ]]; then
 		if [[ $cksum -lt 1 ]]; then
-			log_must zpool status -v $pool
+			log_note "$status"
 			log_fail "Expected CKSUM errors missing for $pool ($cksum)"
 		fi
 
 		if check_pool_status $pool "scan" "repaired 0B"; then
-			log_must zpool status -v $pool
+			log_note "$status"
 			log_fail "Expected repair IO missing for $pool ($cksum)"
 		fi
 	else
@@ -468,12 +474,12 @@ function verify_draid_pool
 	fi
 
 	if ! check_pool_status $pool "scan" "with 0 errors"; then
-		log_must zpool status -v $pool
+		log_note "$status"
 		log_fail "Unexpected repair errors found for $pool"
 	fi
 
 	if ! check_pool_status $pool "errors" "No known data errors"; then
-		log_must zpool status -v $pool
+		log_note "$status"
 		log_fail "Unexpected data errors found for $pool"
 	fi
 }

From 956deba27b5601d8b412455c0e449de7f7e52f80 Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Fri, 8 May 2026 14:34:59 -0400
Subject: [PATCH 009/129] zdb: detect BRT and DDT leaks during block traversal

During -b traversal, track BRT and DDT reference counts and report
blocks claimed more times than their reference tables account for
if it causes claim errors, instead of just asserting it.  Also
report entries with references not fully consumed by the traversal.

Add zdb leaks checks to cloning and dedup tests. This should make
sure the pools are in a sane state after completing the functional
tests.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18494
---
 cmd/zdb/zdb.c                                 | 165 ++++++++++++------
 .../bclone/bclone_crossfs_corner_cases.ksh    |   3 +
 .../functional/bclone/bclone_crossfs_data.ksh |   3 +
 .../bclone/bclone_samefs_corner_cases.ksh     |   3 +
 .../functional/bclone/bclone_samefs_data.ksh  |   3 +
 .../block_cloning_after_device_removal.ksh    |   4 +
 .../block_cloning_lwb_buffer_overflow.ksh     |   3 +
 .../block_cloning/block_cloning_replay.ksh    |   3 +
 .../block_cloning_replay_encrypted.ksh        |   3 +
 .../functional/dedup/dedup_fdt_create.ksh     |   2 +
 .../functional/dedup/dedup_fdt_import.ksh     |   2 +
 .../functional/dedup/dedup_fdt_pacing.ksh     |   2 +
 .../functional/dedup/dedup_legacy_create.ksh  |   2 +
 .../dedup/dedup_legacy_fdt_mixed.ksh          |   2 +
 .../dedup/dedup_legacy_fdt_upgrade.ksh        |   2 +
 .../functional/dedup/dedup_legacy_import.ksh  |   2 +
 .../tests/functional/dedup/dedup_prune.ksh    |   1 +
 .../functional/dedup/dedup_zap_shrink.ksh     |   2 +
 18 files changed, 156 insertions(+), 51 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index a2e65de6662..1dcd70f628b 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -6356,22 +6356,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	int i;
+	boolean_t claimed = B_FALSE;
+	boolean_t ddt_block = B_FALSE;
+	boolean_t brt_block = B_FALSE;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
-	/*
-	 * This flag controls if we will issue a claim for the block while
-	 * counting it, to ensure that all blocks are referenced in space maps.
-	 * We don't issue claims if we're not doing leak tracking, because it's
-	 * expensive if the user isn't interested. We also don't claim the
-	 * second or later occurences of cloned or dedup'd blocks, because we
-	 * already claimed them the first time.
-	 */
-	boolean_t do_claim = !dump_opt['L'];
-
 	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
 
 	blkptr_t tempbp;
@@ -6402,21 +6395,30 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 		ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
 
 		/*
-		 * ddt_lookup() can return NULL if this block didn't exist
-		 * in the DDT and creating it would take the DDT over its
-		 * quota. Since we got the block from disk, it must exist in
-		 * the DDT, so this can't happen. However, when unique entries
-		 * are pruned, the dedup bit can be set with no corresponding
-		 * entry in the DDT.
+		 * ddt_lookup() can return NULL when unique entries are pruned
+		 * from the DDT.
 		 */
 		if (dde == NULL) {
 			ddt_exit(ddt);
-			goto skipped;
+			goto ddt_done;
 		}
 
 		/* Get the phys for this variant */
 		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
 
+		/*
+		 * DDT_PHYS_NONE means the block has the dedup bit set but
+		 * its DVA doesn't match any phys in the entry.  This can
+		 * happen when a DVA was evicted from the DDT and re-added
+		 * on a hash collision.  The block may still have a BRT entry.
+		 */
+		if (v == DDT_PHYS_NONE) {
+			ddt_exit(ddt);
+			goto ddt_done;
+		}
+
+		ddt_block = B_TRUE;
+
 		/*
 		 * This entry may have multiple sets of DVAs. We must claim
 		 * each set the first time we see them in a real block on disk,
@@ -6431,8 +6433,14 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 			dde->dde_io =
 			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));
 
-		/* Consume a reference for this block. */
-		if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
+		/*
+		 * Consume a reference.  If this variant's refcount is already
+		 * zero, the DDT tracking is exhausted — more filesystem
+		 * references exist than the DDT accounts for.
+		 */
+		boolean_t ddt_refcnt_exhausted =
+		    (ddt_phys_refcnt(dde->dde_phys, v) == 0);
+		if (!ddt_refcnt_exhausted)
 			ddt_phys_decref(dde->dde_phys, v);
 
 		/*
@@ -6461,20 +6469,21 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 			bp = &tempbp;
 		}
 
-		if (seen) {
+		if (seen && !ddt_refcnt_exhausted) {
 			/*
 			 * The second or later time we see this block,
 			 * it's a duplicate and we count it.
 			 */
 			zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
 			zcb->zcb_dedup_blocks++;
-
-			/* Already claimed, don't do it again. */
-			do_claim = B_FALSE;
+			claimed = B_TRUE;
 		}
 
 		ddt_exit(ddt);
-	} else if (zcb->zcb_brt_is_active &&
+	}
+
+ddt_done:
+	if (!claimed && zcb->zcb_brt_is_active &&
 	    brt_maybe_exists(zcb->zcb_spa, bp)) {
 		/*
 		 * Cloned blocks are special. We need to count them, so we can
@@ -6482,10 +6491,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 		 * only claim them once.
 		 *
 		 * To do this, we keep our own in-memory BRT. For each block
-		 * we haven't seen before, we look it up in the real BRT and
-		 * if its there, we note it and its refcount then proceed as
-		 * normal. If we see the block again, we count it as a clone
-		 * and then give it no further consideration.
+		 * we haven't seen before, we look it up in the real BRT. If
+		 * we see the block again, we count it as a clone.
 		 */
 		zdb_brt_entry_t zbre_search, *zbre;
 		avl_index_t where;
@@ -6493,36 +6500,27 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 		zbre_search.zbre_dva = bp->blk_dva[0];
 		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
 		if (zbre == NULL) {
-			/* Not seen before; track it */
 			uint64_t refcnt =
 			    brt_entry_get_refcount(zcb->zcb_spa, bp);
 			if (refcnt > 0) {
+				brt_block = B_TRUE;
 				zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
 				    UMEM_NOFAIL);
 				zbre->zbre_dva = bp->blk_dva[0];
 				zbre->zbre_refcount = refcnt;
 				avl_insert(&zcb->zcb_brt, zbre, where);
 			}
-		} else  {
-			/*
-			 * Second or later occurrence, count it and take a
-			 * refcount.
-			 */
-			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
-			zcb->zcb_clone_blocks++;
-
-			zbre->zbre_refcount--;
-			if (zbre->zbre_refcount == 0) {
-				avl_remove(&zcb->zcb_brt, zbre);
-				umem_free(zbre, sizeof (zdb_brt_entry_t));
+		} else {
+			brt_block = B_TRUE;
+			if (zbre->zbre_refcount > 0) {
+				zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
+				zcb->zcb_clone_blocks++;
+				zbre->zbre_refcount--;
+				claimed = B_TRUE;
 			}
-
-			/* Already claimed, don't do it again. */
-			do_claim = B_FALSE;
 		}
 	}
 
-skipped:
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
@@ -6681,12 +6679,21 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 #undef BIN
 
 hist_skipped:
-	if (!do_claim)
+	if (claimed || dump_opt['L'])
 		return;
 
-	VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	int claim_err = zio_wait(zio_claim(NULL, zcb->zcb_spa,
 	    spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
-	    ZIO_FLAG_CANFAIL)));
+	    ZIO_FLAG_CANFAIL));
+	if (claim_err != 0) {
+		char blkbuf[BP_SPRINTF_LEN];
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("block claim error %d%s%s: %s\n",
+		    claim_err, brt_block ? " (BRT)" : "",
+		    ddt_block ? " (DDT)" : "", blkbuf);
+		zcb->zcb_haderrors = 1;
+		zcb->zcb_errors[claim_err]++;
+	}
 }
 
 static void
@@ -7462,10 +7469,66 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
 static boolean_t
 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 {
-	if (dump_opt['L'])
-		return (B_FALSE);
-
 	boolean_t leaks = B_FALSE;
+
+	/*
+	 * Report leaked BRT entries whose refcount was not fully consumed by
+	 * the traversal.
+	 */
+	if (zcb->zcb_brt_is_active) {
+		void *cookie = NULL;
+		zdb_brt_entry_t *zbre;
+		while ((zbre = avl_destroy_nodes(
+		    &zcb->zcb_brt, &cookie)) != NULL) {
+			if (!dump_opt['L'] && zbre->zbre_refcount != 0) {
+				(void) printf("BRT leak: vdev %llu, "
+				    "offset 0x%llx, refcount %llu\n",
+				    (u_longlong_t)DVA_GET_VDEV(
+				    &zbre->zbre_dva),
+				    (u_longlong_t)DVA_GET_OFFSET(
+				    &zbre->zbre_dva),
+				    (u_longlong_t)zbre->zbre_refcount);
+				leaks = B_TRUE;
+			}
+			umem_free(zbre, sizeof (zdb_brt_entry_t));
+		}
+		avl_destroy(&zcb->zcb_brt);
+	}
+
+	if (dump_opt['L'])
+		return (leaks);
+
+	/*
+	 * Report leaked DDT entries whose refcount was not fully consumed by
+	 * the traversal.  Entries in the DDT ZAP that were never looked up
+	 * are not detected here.
+	 */
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		if (ddt == NULL)
+			continue;
+		ddt_enter(ddt);
+		for (ddt_entry_t *dde = avl_first(&ddt->ddt_tree); dde != NULL;
+		    dde = AVL_NEXT(&ddt->ddt_tree, dde)) {
+			for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+				ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+				uint64_t refcnt = ddt_phys_refcnt(dde->dde_phys,
+				    v);
+				if (refcnt == 0)
+					continue;
+				blkptr_t blk;
+				char blkbuf[BP_SPRINTF_LEN];
+				ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
+				    dde->dde_phys, v, &blk);
+				snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
+				(void) printf("DDT leak: refcount %llu %s\n",
+				    (u_longlong_t)refcnt, blkbuf);
+				leaks = B_TRUE;
+			}
+		}
+		ddt_exit(ddt);
+	}
+
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (unsigned c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh
index 01e9cf49dc8..cda4b0ee953 100755
--- a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh
+++ b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh
@@ -51,4 +51,7 @@ log_must zfs set recordsize=$RECORDSIZE $TESTDSTFS
 
 bclone_corner_cases_test $TESTSRCDIR $TESTDSTDIR
 
+sync_pool $TESTPOOL
+log_must zdb -b $TESTPOOL
+
 log_pass
diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh
index e1b583813f1..0d2c0f6e16c 100755
--- a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh
+++ b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh
@@ -50,4 +50,7 @@ for filesize in 1 107 113 511 512 513 4095 4096 4097 131071 131072 131073 \
     bclone_test random $filesize false $TESTSRCDIR $TESTDSTDIR
 done
 
+sync_pool $TESTPOOL
+log_must zdb -b $TESTPOOL
+
 log_pass
diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh
index d18a1bd2490..619fc3e4216 100755
--- a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh
+++ b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh
@@ -45,4 +45,7 @@ log_must zfs set recordsize=$RECORDSIZE $TESTSRCFS
 
 bclone_corner_cases_test $TESTSRCDIR $TESTSRCDIR
 
+sync_pool $TESTPOOL
+log_must zdb -b $TESTPOOL
+
 log_pass
diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh
index 45551e04646..f1f80a9c059 100755
--- a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh
+++ b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh
@@ -46,4 +46,7 @@ for filesize in 1 107 113 511 512 513 4095 4096 4097 131071 131072 131073 \
     bclone_test random $filesize false $TESTSRCDIR $TESTSRCDIR
 done
 
+sync_pool $TESTPOOL
+log_must zdb -b $TESTPOOL
+
 log_pass
diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_after_device_removal.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_after_device_removal.ksh
index b407d4c541d..d4b7f01e8ba 100755
--- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_after_device_removal.ksh
+++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_after_device_removal.ksh
@@ -57,5 +57,9 @@ log_must zfs create $TESTPOOL/$TESTFS
 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/$TESTFS/file
 log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=16M count=2
 log_must zfs destroy -r $TESTPOOL/$TESTFS
+wait_freeing $TESTPOOL
+sync_pool $TESTPOOL
+
+log_must zdb -b $TESTPOOL
 
 log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh
index 4c652923545..7c183234922 100755
--- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh
+++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh
@@ -83,5 +83,8 @@ typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file1 $TESTPOOL/$TESTFS file2
 # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1).
 log_must [ "$blocks" = "$(seq -s " " 0 1021 | sed 's/ $//')" ]
 
+sync_pool $TESTPOOL
+log_must zdb -b $TESTPOOL
+
 log_pass "LWB buffer overflow is not triggered with multiple VDEVs ZIL"
 
diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh
index 2e854d7e543..ad24c1f06ba 100755
--- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh
+++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh
@@ -126,4 +126,7 @@ typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \
 # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1).
 log_must [ "$blocks" = "$(seq -s " " 0 2047 | sed 's/ $//')" ]
 
+sync_pool $TESTPOOL
+log_must zdb -b $TESTPOOL
+
 log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh
index eb1464ff4d4..6b9ea354226 100755
--- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh
+++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh
@@ -128,4 +128,7 @@ typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \
 # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1).
 log_must [ "$blocks" = "$(seq -s " " 0 2047 | sed 's/ $//')" ]
 
+sync_pool $TESTPOOL
+log_must zdb -b $TESTPOOL
+
 log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
index 1a82e5d30a1..6e67a46b040 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
@@ -104,4 +104,6 @@ log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
 # logical table now destroyed; containing object destroyed
 log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0
 
+log_must zdb -b $TESTPOOL
+
 log_pass "basic dedup (FDT) operations work"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
index 5f6eb7c3400..3a90d656d00 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
@@ -117,4 +117,6 @@ obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }')
 # with only one ZAP inside
 log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1
 
+log_must zdb -b $TESTPOOL
+
 log_pass "dedup (FDT) retains version after import"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh
index 8028e4f0884..1fc598c5dd2 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh
@@ -107,4 +107,6 @@ log_entries3=$(get_ddt_log_entries)
 # Verify there are 256 entries in the unique table.
 log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=256'"
 
+log_must zdb -b $TESTPOOL
+
 log_pass "dedup (FDT) paces out log entries appropriately"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
index 3348614cb74..4422502452b 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
@@ -93,4 +93,6 @@ log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
 # logical table now destroyed; all DDT ZAPs removed
 log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0
 
+log_must zdb -b $TESTPOOL
+
 log_pass "basic dedup (legacy) operations work"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
index c962efaa7c5..b51eae2ad08 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
@@ -102,4 +102,6 @@ log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | wc -l) -eq 1
 obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | awk '{ print $NF }')
 log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-.*-zap- | wc -l) -eq 1
 
+log_must zdb -b $TESTPOOL
+
 log_pass "legacy and FDT dedup tables on the same pool can happily coexist"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
index 94f009fc0d0..ece43036c07 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
@@ -127,4 +127,6 @@ obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }')
 # with one ZAP inside
 log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1
 
+log_must zdb -b $TESTPOOL
+
 log_pass "legacy dedup tables work after upgrade; new dedup tables created as FDT"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh
index 9f6b1ef12a9..550f51cdb82 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh
@@ -102,4 +102,6 @@ log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'"
 # should be just one DDT ZAP in the MOS
 log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1
 
+log_must zdb -b $TESTPOOL
+
 log_pass "dedup (legacy) retains version after import"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh
index 6b4937cc4a2..c0a2adb30c2 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh
@@ -95,5 +95,6 @@ new_entries=$(ddt_entries)
 [[ "$((entries / 4))" -eq "$new_entries" ]] || \
 	log_fail "DDT entries did not shrink enough: $entries -> $new_entries"
 
+log_must zdb -b $TESTPOOL
 
 log_pass "DDT pruning correctly removes non-duplicate entries"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh
index 597bad253ec..41586204333 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh
@@ -83,4 +83,6 @@ log_must zpool import $TESTPOOL
 nleafs=$(zdb -dddd $TESTPOOL "$zap_obj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
 log_must test $nleafs -lt $nleafs_old
 
+log_must zdb -b $TESTPOOL
+
 log_pass "ZAP object shrank after removing entries."

From c7cfe0805cd99232249d7336fbdbd07a20a9ea16 Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Sat, 9 May 2026 03:01:47 +0500
Subject: [PATCH 010/129] zarcstat: detect attached L2ARC device with no data

zarcstat and zarcsummary detected L2ARC presence using the l2_size
kstat, which is data held in L2ARC, not whether a cache device is
attached. When a cache device was attached but empty (freshly added,
or fully evicted):

  - zarcstat rejected "-f l2*" with "Incompatible field specified!"
  - zarcsummary printed "L2ARC not detected, skipping section",
    hiding cumulative I/O history and health counters

Expose the existing l2arc_ndev counter as a new kstat l2_ndev.
It is maintained by l2arc_add_vdev() and l2arc_remove_vdev(), so it
tracks attachment in real time. Use it in both tools, falling back to
l2_size for compatibility with older kernel modules.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #18499
---
 cmd/zarcstat.in        | 6 +++---
 cmd/zarcsummary        | 5 ++++-
 include/sys/arc_impl.h | 2 ++
 module/zfs/arc.c       | 2 ++
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/cmd/zarcstat.in b/cmd/zarcstat.in
index 8ffd2048116..ad0e12e9fbf 100755
--- a/cmd/zarcstat.in
+++ b/cmd/zarcstat.in
@@ -565,10 +565,10 @@ def init():
 
     update_hdr_intr()
 
-    # check if L2ARC exists
+    # check if L2ARC exists; fall back to l2_size for older kernels that
+    # do not export l2_ndev
     snap_stats()
-    l2_size = cur.get("l2_size")
-    if l2_size:
+    if cur.get("l2_ndev") or cur.get("l2_size"):
         l2exist = True
 
     if desired_cols:
diff --git a/cmd/zarcsummary b/cmd/zarcsummary
index 24a129d9ca7..5b6e35465ea 100755
--- a/cmd/zarcsummary
+++ b/cmd/zarcsummary
@@ -856,7 +856,10 @@ def section_l2arc(kstats_dict):
     # The L2ARC statistics live in the same section as the normal ARC stuff
     arc_stats = isolate_section('arcstats', kstats_dict)
 
-    if arc_stats['l2_size'] == '0':
+    # Skip the section only when no cache device is attached. Fall back to
+    # l2_size for older kernels that do not export l2_ndev.
+    if arc_stats.get('l2_ndev', '0') == '0' and \
+            arc_stats['l2_size'] == '0':
         print('L2ARC not detected, skipping section\n')
         return
 
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index dbe712e2e73..8fde5c4fe50 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -832,6 +832,8 @@ typedef struct arc_stats {
 	 * due to ARC_FLAG_UNCACHED being set.
 	 */
 	kstat_named_t arcstat_uncached_evictable_metadata;
+	/* Number of L2ARC devices currently attached across all pools. */
+	kstat_named_t arcstat_l2_ndev;
 	kstat_named_t arcstat_l2_hits;
 	kstat_named_t arcstat_l2_misses;
 	/*
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 053314db185..22b189d5bb8 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -586,6 +586,7 @@ arc_stats_t arc_stats = {
 	{ "uncached_metadata",		KSTAT_DATA_UINT64 },
 	{ "uncached_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "uncached_evictable_metadata", KSTAT_DATA_UINT64 },
+	{ "l2_ndev",			KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_prefetch_asize",		KSTAT_DATA_UINT64 },
@@ -7440,6 +7441,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
 	    aggsum_value(&arc_sums.arcstat_dnode_size);
 	as->arcstat_bonus_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size);
+	as->arcstat_l2_ndev.value.ui64 = l2arc_ndev;
 	as->arcstat_l2_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_hits);
 	as->arcstat_l2_misses.value.ui64 =

From 9ae9f2e9831d121d80871eddb08a80c625ae99d3 Mon Sep 17 00:00:00 2001
From: Gality <68463495+Gality369@users.noreply.github.com>
Date: Sat, 9 May 2026 06:08:21 +0800
Subject: [PATCH 011/129] Linux: annotate nested xattr setattr znode locks

zfs_setattr() updates both the target znode and its hidden xattr
directory when ownership, mode, or project ID changes. The xattr
directory uses the same z_acl_lock and z_lock classes as the
parent znode, so lockdep reports recursive locking when the
second znode's mutexes are acquired.

This is a lockdep false positive rather than a real deadlock.
attrzp is the target file's hidden xattr directory, and the code
does not acquire these znode mutexes in the reverse order.
Acquire the attrzp mutexes with mutex_enter_nested() so lockdep
treats them as nested.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: ZhengYuan Huang <gality369@gmail.com>
Co-authored-by: gality369 <gality369@example.com>
Closes #18506
---
 module/os/linux/zfs/zfs_vnops_os.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index e65f8123012..1e1e663b1f7 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -2434,9 +2434,13 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
+		/*
+		 * attrzp is zp's hidden xattr directory, so the second
+		 * znode lock acquisition is nested rather than recursive.
+		 */
 		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
-			mutex_enter(&attrzp->z_acl_lock);
-		mutex_enter(&attrzp->z_lock);
+			mutex_enter_nested(&attrzp->z_acl_lock, NESTED_SINGLE);
+		mutex_enter_nested(&attrzp->z_lock, NESTED_SINGLE);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));

From 45dddc4523c75fe41db71b3a85f30f08b92b0fbe Mon Sep 17 00:00:00 2001
From: Mateusz Piotrowski <0mp@FreeBSD.org>
Date: Mon, 11 May 2026 21:04:58 +0200
Subject: [PATCH 012/129] zfs.4: Fix documentation of
 zfs_arc_dnode_reduce_percent

Fixes: 25458cbef Limit the amount of dnode metadata in the ARC
Fixes: 5b9f3b766 Soften pruning threshold on not evictable metadata

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org>
Closes #18513
---
 man/man4/zfs.4 | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index c1fe65d2ad9..29fdbd3eb44 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -4,6 +4,7 @@
 .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
 .\" Copyright (c) 2019 Datto Inc.
 .\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
+.\" Copyright (c) 2026, Mateusz Piotrowski <0mp@FreeBSD.org>
 .\"
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License").  You may not use this file except
@@ -18,7 +19,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd September 15, 2025
+.Dd May 8, 2026
 .Dt ZFS 4
 .Os
 .
@@ -768,9 +769,15 @@ See also
 which serves a similar purpose but has a higher priority if nonzero.
 .
 .It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64
-Percentage of ARC dnodes to try to scan in response to demand for non-metadata
-when the number of bytes consumed by dnodes exceeds
-.Sy zfs_arc_dnode_limit .
+Percentage used to size dnode prune requests.
+The request size is the larger of two values:
+.Sy zfs_arc_dnode_reduce_percent
+applied to the dnode count above
+.Sy zfs_arc_dnode_limit ,
+or
+.Sy zfs_arc_dnode_reduce_percent
+applied to the total dnode count
+when non-evictable metadata exceeds 3/4 of the metadata target.
 .
 .It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint
 The ARC's buffer hash table is sized based on the assumption of an average

From 35853ac849d9ba36f8a414cfd9780b1dfb8ee87a Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Mon, 11 May 2026 21:16:48 +0200
Subject: [PATCH 013/129] CI: skip qemu matrix for documentation-only pull
 requests

Add a new "docs" CI type, selected when every file modified by a
pull request matches a documentation pattern (man pages, .md,
AUTHORS, COPYRIGHT, LICENSE, NOTICE, .gitignore). For this type the
os_selection is empty and the qemu matrix runs no jobs.

This affects only pull requests whose entire diff is documentation.
Any change touching a non-documentation file continues to be
classified as full, quick, linux, or freebsd by the existing
file-path rules, and a manual ZFS-CI-Type commit tag still overrides
that classification.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18518
---
 .github/workflows/scripts/generate-ci-type.py | 22 +++++++++++++++++++
 .github/workflows/zfs-qemu.yml                |  4 ++++
 2 files changed, 26 insertions(+)

diff --git a/.github/workflows/scripts/generate-ci-type.py b/.github/workflows/scripts/generate-ci-type.py
index b1910ab630a..08f0c0fcc9a 100755
--- a/.github/workflows/scripts/generate-ci-type.py
+++ b/.github/workflows/scripts/generate-ci-type.py
@@ -6,6 +6,9 @@
 Output format: "<type> <source>" where source is "manual" (from
 ZFS-CI-Type commit tag) or "auto" (from file change heuristics).
 
+Prints "docs auto" if every changed file is documentation; the qemu
+matrix is skipped in that case.
+
 Prints "quick manual" if:
 - the *last* commit message contains 'ZFS-CI-Type: quick'
 or "quick auto" if (heuristics):
@@ -28,6 +31,19 @@
     r'.*\.gitignore'
 ]))
 
+"""
+Patterns of files that are documentation only.
+"""
+DOCS_ONLY_REGEX = list(map(re.compile, [
+    r'man/.*',
+    r'.*\.md',
+    r'AUTHORS',
+    r'COPYRIGHT',
+    r'LICENSE',
+    r'NOTICE',
+    r'\.gitignore',
+]))
+
 """
 Patterns of files that are considered to trigger full CI.
 """
@@ -116,6 +132,12 @@ def output_type(type, source, reason):
                         f'changed file "{f}" matches pattern "{r.pattern}"'
                         )
 
+    if changed_files and all(
+            any(r.match(f) for r in DOCS_ONLY_REGEX)
+            for f in changed_files):
+        output_type('docs', 'auto',
+                    'all changed files are documentation')
+
     # catch-all
     output_type('quick', 'auto',
                 'no changed file matches full CI patterns')
diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index 4b4fd27543f..f07988f701d 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -45,6 +45,9 @@ jobs:
           fi
 
           case "$ci_type" in
+          docs)
+            os_selection='[]'
+            ;;
           quick)
             os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd15-1s", "ubuntu24"]'
             ;;
@@ -91,6 +94,7 @@ jobs:
   qemu-vm:
     name: qemu-x86
     needs: [ test-config ]
+    if: needs.test-config.outputs.ci_type != 'docs'
     strategy:
       fail-fast: false
       matrix:

From 968f4db039f07a6cac32df677301d12025ede873 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Mon, 11 May 2026 21:19:28 +0200
Subject: [PATCH 014/129] zpool-attach.8: add EXAMPLES section

Mirror-attach (shared with zpool.8 example 5) and raidz expansion.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18508
---
 man/man8/zpool-attach.8 | 31 ++++++++++++++++++++++++++++++-
 man/man8/zpool.8        |  1 +
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8
index 04996ed4fa1..8394a5efba6 100644
--- a/man/man8/zpool-attach.8
+++ b/man/man8/zpool-attach.8
@@ -27,7 +27,7 @@
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\"
-.Dd November 8, 2023
+.Dd May 9, 2026
 .Dt ZPOOL-ATTACH 8
 .Os
 .
@@ -132,6 +132,35 @@ Waits until
 has finished resilvering or expanding before returning.
 .El
 .
+.Sh EXAMPLES
+.\" Example 1 is example 5 from zpool.8.
+.\" Make sure to update them bidirectionally
+.Ss Example 1 : No Making a non-mirrored ZFS Storage Pool mirrored
+The following command converts an existing single device
+.Ar sda
+into a mirror by attaching a second device to it,
+.Ar sdb .
+.Dl # Nm zpool Cm attach Ar tank Pa sda sdb
+.
+.Ss Example 2 : No Expanding a RAID-Z vdev with an additional disk
+The following command adds
+.Ar sdg
+to the existing
+.Ar raidz2-0
+vdev in
+.Ar tank ,
+turning a 6-wide RAID-Z2 into a 7-wide RAID-Z2:
+.Dl # Nm zpool Cm attach Ar tank raidz2-0 Pa sdg
+Progress is reported by
+.Nm zpool Cm status .
+The operation requires the
+.Sy raidz_expansion
+pool feature, and
+.Ar sdg
+must be at least as large as the smallest existing disk in the vdev.
+Old blocks keep their original data-to-parity ratio; only blocks written
+after the expansion use the new ratio.
+.
 .Sh SEE ALSO
 .Xr zpool-add 8 ,
 .Xr zpool-detach 8 ,
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index 4b07f96bbcb..25dff473c30 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -245,6 +245,7 @@ Invalid command line options were specified.
 .
 .Sh EXAMPLES
 .\" Examples 1, 2, 3, 4, 12, 13 are shared with zpool-create.8.
+.\" Example 5 is shared with zpool-attach.8.
 .\" Examples 6, 14 are shared with zpool-add.8.
 .\" Examples 7, 16 are shared with zpool-list.8.
 .\" Examples 8 are shared with zpool-destroy.8.

From d50f5b6d0b9e80d9974eddb1510bd0518509419d Mon Sep 17 00:00:00 2001
From: Gality <68463495+Gality369@users.noreply.github.com>
Date: Tue, 12 May 2026 04:13:28 +0800
Subject: [PATCH 015/129] dsl_dir: avoid dd_lock during snapshots_changed
 updates

Avoid holding dd_lock while updating the on-disk
snapshots_changed timestamp.

Both dsl_dir_zapify() and zap_update() may dirty buffers
and recurse into space accounting, which can take dd_lock.
Holding dd_lock across either operation can therefore
preserve the lock-order inversion reported by lockdep.

Only protect the in-memory dd_snap_cmtime update
with dd_lock. Perform the zapify and ZAP update without
dd_lock held, and retry the on-disk write if another updater
advanced dd_snap_cmtime while the write was in progress.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: ZhengYuan Huang <gality369@gmail.com>
Co-authored-by: gality369 <gality369@example.com>
Closes #18472
---
 module/zfs/dsl_dir.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index 335b11dc2ff..e88de3dbdfd 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -2304,22 +2304,29 @@ dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	inode_timespec_t t;
+
+	ASSERT(dsl_pool_sync_context(dp));
 	gethrestime(&t);
 
 	mutex_enter(&dd->dd_lock);
 	dd->dd_snap_cmtime = t;
-	if (spa_feature_is_enabled(dp->dp_spa,
-	    SPA_FEATURE_EXTENSIBLE_DATASET)) {
-		objset_t *mos = dd->dd_pool->dp_meta_objset;
-		uint64_t ddobj = dd->dd_object;
-		dsl_dir_zapify(dd, tx);
-		VERIFY0(zap_update(mos, ddobj,
-		    DD_FIELD_SNAPSHOTS_CHANGED,
-		    sizeof (uint64_t),
-		    sizeof (inode_timespec_t) / sizeof (uint64_t),
-		    &t, tx));
-	}
 	mutex_exit(&dd->dd_lock);
+
+	if (!spa_feature_is_enabled(dp->dp_spa,
+	    SPA_FEATURE_EXTENSIBLE_DATASET)) {
+		return;
+	}
+
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+
+	/*
+	 * dsl_dir_zapify() and zap_update() may dirty buffers and recurse
+	 * into space accounting, so do not call them with dd_lock held.
+	 */
+	dsl_dir_zapify(dd, tx);
+	VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_SNAPSHOTS_CHANGED,
+	    sizeof (uint64_t),
+	    sizeof (inode_timespec_t) / sizeof (uint64_t), &t, tx));
 }
 
 void

From 8ff64005a2acc48fe18a5bfa3edb591376ce0bd4 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Fri, 8 May 2026 21:31:13 +1000
Subject: [PATCH 016/129] zap: split implementation out into more files

The ZAP code is mixed up across a few files without clear separation of
concerns. This splits it out from three source files to five:

- zap.c: the bulk of the "public" interface
- zap_impl.c: internals shared across all backends
- zap_micro.c: microzap backend
- zap_fat.c: fatzap backend: core logic
- zap_leaf.c: fatzap backend: leaf blocks

Note that this does not change any code; it just moves functions around.
Also note that right now the microzap and fatzap backends know more
about each other than is healthy. This change is simply marking out
where different things should live in the end, to make it easier for
that refactoring work to begin.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18516
---
 include/sys/zap_impl.h   |   23 +-
 lib/libzpool/Makefile.am |    2 +
 module/Kbuild.in         |    2 +
 module/Makefile.bsd      |    4 +-
 module/zfs/zap.c         | 2511 ++++++++++++++++----------------------
 module/zfs/zap_fat.c     | 1502 +++++++++++++++++++++++
 module/zfs/zap_impl.c    |  527 ++++++++
 module/zfs/zap_micro.c   | 1602 +-----------------------
 8 files changed, 3135 insertions(+), 3038 deletions(-)
 create mode 100644 module/zfs/zap_fat.c
 create mode 100644 module/zfs/zap_impl.c

diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index d010c3c305c..78c57e522bc 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -203,17 +203,38 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname);
 int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp);
+int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
+    zap_t **zapp);
+int zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
 void zap_unlockdir(zap_t *zap, const void *tag);
 void zap_evict_sync(void *dbu);
+zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key,
+    int numints);
 zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt);
+int zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt);
+zap_name_t * zap_name_alloc(zap_t *zap, boolean_t longname);
 void zap_name_free(zap_name_t *zn);
 int zap_hashbits(zap_t *zap);
 uint32_t zap_maxcd(zap_t *zap);
 uint64_t zap_getflags(zap_t *zap);
+int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
+    size_t outlen);
+uint64_t zap_hash(zap_name_t *zn);
 
 uint64_t zap_get_micro_max_size(spa_t *spa);
 
-#define	ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+zap_t *mzap_open(dmu_buf_t *db);
+int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx,
+    zap_flags_t flags);
+mzap_ent_t *mze_find(zap_name_t *zn, zfs_btree_index_t *idx);
+boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash);
+void mze_destroy(zap_t *zap);
+boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn,
+    mzap_ent_t *mze, zfs_btree_index_t *idx);
+void mzap_addent(zap_name_t *zn, uint64_t value);
+void mzap_byteswap(mzap_phys_t *buf, size_t size);
 
 void fzap_byteswap(void *buf, size_t size);
 int fzap_count(zap_t *zap, uint64_t *count);
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 8192553072f..05105407d52 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -166,6 +166,8 @@ nodist_libzpool_la_SOURCES = \
 	module/zfs/vdev_root.c \
 	module/zfs/vdev_trim.c \
 	module/zfs/zap.c \
+	module/zfs/zap_fat.c \
+	module/zfs/zap_impl.c \
 	module/zfs/zap_leaf.c \
 	module/zfs/zap_micro.c \
 	module/zfs/zcp.c \
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 47e739ea4d6..ff2c96b85ae 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -408,6 +408,8 @@ ZFS_OBJS := \
 	vdev_root.o \
 	vdev_trim.o \
 	zap.o \
+	zap_fat.o \
+	zap_impl.o \
 	zap_leaf.o \
 	zap_micro.o \
 	zcp.o \
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 30cf741b965..96c3f3b2418 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -345,6 +345,8 @@ SRCS+=	abd.c \
 	vdev_root.c \
 	vdev_trim.c \
 	zap.c \
+	zap_fat.c \
+	zap_impl.c \
 	zap_leaf.c \
 	zap_micro.c \
 	zcp.c \
@@ -475,8 +477,8 @@ CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
 CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
 CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual
 CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.zap_impl.c= -Wno-cast-qual
 CFLAGS.zap_leaf.c= -Wno-cast-qual
-CFLAGS.zap_micro.c= -Wno-cast-qual
 CFLAGS.zcp.c= -Wno-cast-qual
 CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith
 CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index b40d765e342..fa3f8b836c9 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -19,1075 +19,116 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
- * Copyright (c) 2023, Klara Inc.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2024, Klara, Inc.
  */
 
-/*
- * This file contains the top half of the zfs directory structure
- * implementation. The bottom half is in zap_leaf.c.
- *
- * The zdir is an extendable hash data structure. There is a table of
- * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
- * each a constant size and hold a variable number of directory entries.
- * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
- *
- * The pointer table holds a power of 2 number of pointers.
- * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
- * by the pointer at index i in the table holds entries whose hash value
- * has a zd_prefix_len - bit prefix
- */
-
-#include <sys/spa.h>
+#include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_znode.h>
-#include <sys/fs/zfs.h>
+#include <sys/btree.h>
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 
-/*
- * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
- * (all leaf blocks) when we start iterating over it.
- *
- * For zap_cursor_init(), the callers all intend to iterate through all the
- * entries.  There are a few cases where an error (typically i/o error) could
- * cause it to bail out early.
- *
- * For zap_cursor_init_serialized(), there are callers that do the iteration
- * outside of ZFS.  Typically they would iterate over everything, but we
- * don't have control of that.  E.g. zfs_ioc_snapshot_list_next(),
- * zcp_snapshots_iter(), and other iterators over things in the MOS - these
- * are called by /sbin/zfs and channel programs.  The other example is
- * zfs_readdir() which iterates over directory entries for the getdents()
- * syscall.  /sbin/ls iterates to the end (unless it receives a signal), but
- * userland doesn't have to.
- *
- * Given that the ZAP entries aren't returned in a specific order, the only
- * legitimate use cases for partial iteration would be:
- *
- * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
- *    get the first 100 and then wait for the user to hit "next page", which
- *    they may never do).
- *
- * 2. You want to know if there are more than X entries, without relying on
- *    the zfs-specific implementation of the directory's st_size (which is
- *    the number of entries).
- */
-static int zap_iterate_prefetch = B_TRUE;
-
-/*
- * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
- * collapsed into a single block.
- */
-int zap_shrink_enabled = B_TRUE;
-
-int fzap_default_block_shift = 14; /* 16k blocksize */
-
-static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
-static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
-
-void
-fzap_byteswap(void *vbuf, size_t size)
-{
-	uint64_t block_type = *(uint64_t *)vbuf;
-
-	if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
-		zap_leaf_byteswap(vbuf, size);
-	else {
-		/* it's a ptrtbl block */
-		byteswap_uint64_array(vbuf, size);
-	}
-}
-
-void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
-{
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	zap->zap_ismicro = FALSE;
-
-	zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
-	zap->zap_dbu.dbu_evict_func_async = NULL;
-
-	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0);
-	zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
-
-	zap_phys_t *zp = zap_f_phys(zap);
-	/*
-	 * explicitly zero it since it might be coming from an
-	 * initialized microzap
-	 */
-	memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size);
-	zp->zap_block_type = ZBT_HEADER;
-	zp->zap_magic = ZAP_MAGIC;
-
-	zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
-
-	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
-	zp->zap_num_leafs = 1;
-	zp->zap_num_entries = 0;
-	zp->zap_salt = zap->zap_salt;
-	zp->zap_normflags = zap->zap_normflags;
-	zp->zap_flags = flags;
-
-	/* block 1 will be the first leaf */
-	for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
-		ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
-
-	/*
-	 * set up block 1 - the first leaf
-	 */
-	dmu_buf_t *db;
-	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
-	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
-	dmu_buf_will_dirty(db, tx);
-
-	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
-	l->l_dbuf = db;
-
-	zap_leaf_init(l, zp->zap_normflags != 0);
-
-	kmem_free(l, sizeof (zap_leaf_t));
-	dmu_buf_rele(db, FTAG);
-}
-
-static int
-zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
-{
-	if (RW_WRITE_HELD(&zap->zap_rwlock))
-		return (1);
-	if (rw_tryupgrade(&zap->zap_rwlock)) {
-		dmu_buf_will_dirty(zap->zap_dbuf, tx);
-		return (1);
-	}
-	return (0);
-}
-
-/*
- * Generic routines for dealing with the pointer & cookie tables.
- */
-
-static int
-zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
-    void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
-    dmu_tx_t *tx)
-{
-	uint64_t newblk;
-	int bs = FZAP_BLOCK_SHIFT(zap);
-	int hepb = 1<<(bs-4);
-	/* hepb = half the number of entries in a block */
-
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT(tbl->zt_blk != 0);
-	ASSERT(tbl->zt_numblks > 0);
-
-	if (tbl->zt_nextblk != 0) {
-		newblk = tbl->zt_nextblk;
-	} else {
-		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
-		tbl->zt_nextblk = newblk;
-		ASSERT0(tbl->zt_blks_copied);
-		dmu_prefetch_by_dnode(zap->zap_dnode, 0,
-		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
-		    ZIO_PRIORITY_SYNC_READ);
-	}
-
-	/*
-	 * Copy the ptrtbl from the old to new location.
-	 */
-
-	uint64_t b = tbl->zt_blks_copied;
-	dmu_buf_t *db_old;
-	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
-	    (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
-	if (err != 0)
-		return (err);
-
-	/* first half of entries in old[b] go to new[2*b+0] */
-	dmu_buf_t *db_new;
-	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
-	    (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
-	dmu_buf_will_dirty(db_new, tx);
-	transfer_func(db_old->db_data, db_new->db_data, hepb);
-	dmu_buf_rele(db_new, FTAG);
-
-	/* second half of entries in old[b] go to new[2*b+1] */
-	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
-	    (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
-	dmu_buf_will_dirty(db_new, tx);
-	transfer_func((uint64_t *)db_old->db_data + hepb,
-	    db_new->db_data, hepb);
-	dmu_buf_rele(db_new, FTAG);
-
-	dmu_buf_rele(db_old, FTAG);
-
-	tbl->zt_blks_copied++;
-
-	dprintf("copied block %llu of %llu\n",
-	    (u_longlong_t)tbl->zt_blks_copied,
-	    (u_longlong_t)tbl->zt_numblks);
-
-	if (tbl->zt_blks_copied == tbl->zt_numblks) {
-		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
-		    tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
-
-		tbl->zt_blk = newblk;
-		tbl->zt_numblks *= 2;
-		tbl->zt_shift++;
-		tbl->zt_nextblk = 0;
-		tbl->zt_blks_copied = 0;
-
-		dprintf("finished; numblocks now %llu (%uk entries)\n",
-		    (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10));
-	}
-
-	return (0);
-}
-
-static int
-zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
-    dmu_tx_t *tx)
-{
-	int bs = FZAP_BLOCK_SHIFT(zap);
-
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-	ASSERT(tbl->zt_blk != 0);
-
-	dprintf("storing %llx at index %llx\n", (u_longlong_t)val,
-	    (u_longlong_t)idx);
-
-	uint64_t blk = idx >> (bs-3);
-	uint64_t off = idx & ((1<<(bs-3))-1);
-
-	dmu_buf_t *db;
-	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
-	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
-	if (err != 0)
-		return (err);
-	dmu_buf_will_dirty(db, tx);
-
-	if (tbl->zt_nextblk != 0) {
-		uint64_t idx2 = idx * 2;
-		uint64_t blk2 = idx2 >> (bs-3);
-		uint64_t off2 = idx2 & ((1<<(bs-3))-1);
-		dmu_buf_t *db2;
-
-		err = dmu_buf_hold_by_dnode(zap->zap_dnode,
-		    (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
-		    DMU_READ_NO_PREFETCH);
-		if (err != 0) {
-			dmu_buf_rele(db, FTAG);
-			return (err);
-		}
-		dmu_buf_will_dirty(db2, tx);
-		((uint64_t *)db2->db_data)[off2] = val;
-		((uint64_t *)db2->db_data)[off2+1] = val;
-		dmu_buf_rele(db2, FTAG);
-	}
-
-	((uint64_t *)db->db_data)[off] = val;
-	dmu_buf_rele(db, FTAG);
-
-	return (0);
-}
-
-static int
-zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
-{
-	int bs = FZAP_BLOCK_SHIFT(zap);
-
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
-	uint64_t blk = idx >> (bs-3);
-	uint64_t off = idx & ((1<<(bs-3))-1);
-
-	dmu_buf_t *db;
-	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
-	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
-	if (err != 0)
-		return (err);
-	*valp = ((uint64_t *)db->db_data)[off];
-	dmu_buf_rele(db, FTAG);
-
-	if (tbl->zt_nextblk != 0) {
-		/*
-		 * read the nextblk for the sake of i/o error checking,
-		 * so that zap_table_load() will catch errors for
-		 * zap_table_store.
-		 */
-		blk = (idx*2) >> (bs-3);
-
-		err = dmu_buf_hold_by_dnode(zap->zap_dnode,
-		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
-		    DMU_READ_NO_PREFETCH);
-		if (err == 0)
-			dmu_buf_rele(db, FTAG);
-	}
-	return (err);
-}
-
-/*
- * Routines for growing the ptrtbl.
- */
-
-static void
-zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
-{
-	for (int i = 0; i < n; i++) {
-		uint64_t lb = src[i];
-		dst[2 * i + 0] = lb;
-		dst[2 * i + 1] = lb;
-	}
-}
-
-static int
-zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
-{
-	/*
-	 * The pointer table should never use more hash bits than we
-	 * have (otherwise we'd be using useless zero bits to index it).
-	 * If we are within 2 bits of running out, stop growing, since
-	 * this is already an aberrant condition.
-	 */
-	if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
-		return (SET_ERROR(ENOSPC));
-
-	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
-		/*
-		 * We are outgrowing the "embedded" ptrtbl (the one
-		 * stored in the header block).  Give it its own entire
-		 * block, which will double the size of the ptrtbl.
-		 */
-		ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
-		    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
-		ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
-
-		uint64_t newblk = zap_allocate_blocks(zap, 1);
-		dmu_buf_t *db_new;
-		int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
-		    newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
-		    DMU_READ_NO_PREFETCH);
-		if (err != 0)
-			return (err);
-		dmu_buf_will_dirty(db_new, tx);
-		zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
-		    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
-		dmu_buf_rele(db_new, FTAG);
-
-		zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
-		zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
-		zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
-
-		ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
-		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
-		    (FZAP_BLOCK_SHIFT(zap)-3));
-
-		return (0);
-	} else {
-		return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
-		    zap_ptrtbl_transfer, tx));
-	}
-}
-
-static void
-zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
-{
-	dmu_buf_will_dirty(zap->zap_dbuf, tx);
-	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
-	ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
-	zap_f_phys(zap)->zap_num_entries += delta;
-	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
-}
-
 static uint64_t
-zap_allocate_blocks(zap_t *zap, int nblocks)
+zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
 {
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
-	zap_f_phys(zap)->zap_freeblk += nblocks;
-	return (newblk);
-}
+	uint64_t obj;
 
-static void
-zap_leaf_evict_sync(void *dbu)
-{
-	zap_leaf_t *l = dbu;
+	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
 
-	rw_destroy(&l->l_rwlock);
-	kmem_free(l, sizeof (zap_leaf_t));
-}
-
-static zap_leaf_t *
-zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
-{
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-	uint64_t blkid = zap_allocate_blocks(zap, 1);
-	dmu_buf_t *db = NULL;
-
-	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
-	    blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db,
-	    DMU_READ_NO_PREFETCH));
-
-	/*
-	 * Create the leaf structure and stash it on the dbuf. If zap was
-	 * recent shrunk or truncated, the dbuf might have been sitting in the
-	 * cache waiting to be evicted, and so still have the old leaf attached
-	 * to it. If so, just reuse it.
-	 */
-	zap_leaf_t *l = dmu_buf_get_user(db);
-	if (l == NULL) {
-		l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
-		l->l_blkid = blkid;
-		l->l_dbuf = db;
-		rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL);
-		dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL,
-		    &l->l_dbuf);
-		dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
+	if (allocated_dnode == NULL) {
+		dnode_t *dn;
+		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
+		    indirect_blockshift, bonustype, bonuslen, dnodesize,
+		    &dn, FTAG, tx);
+		mzap_create_impl(dn, normflags, flags, tx);
+		dnode_rele(dn, FTAG);
 	} else {
-		ASSERT3U(l->l_blkid, ==, blkid);
-		ASSERT3P(l->l_dbuf, ==, db);
+		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
+		    indirect_blockshift, bonustype, bonuslen, dnodesize,
+		    allocated_dnode, tag, tx);
+		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
 	}
 
-	rw_enter(&l->l_rwlock, RW_WRITER);
-	dmu_buf_will_dirty(l->l_dbuf, tx);
-
-	zap_leaf_init(l, zap->zap_normflags != 0);
-
-	zap_f_phys(zap)->zap_num_leafs++;
-
-	return (l);
+	return (obj);
 }
 
-int
-fzap_count(zap_t *zap, uint64_t *count)
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
-	ASSERT(!zap->zap_ismicro);
-	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
-	*count = zap_f_phys(zap)->zap_num_entries;
-	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
-	return (0);
+	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
+}
+
+uint64_t
+zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
+	    dnodesize, tx));
+}
+
+uint64_t
+zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
+	    0, tx));
+}
+
+uint64_t
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
+	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	return (zap_create_flags_dnsize(os, normflags, flags, ot,
+	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
+	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
+	    tx));
 }
 
 /*
- * Routines for obtaining zap_leaf_t's
+ * Create a zap object and return a pointer to the newly allocated dnode via
+ * the allocated_dnode argument.  The returned dnode will be held and the
+ * caller is responsible for releasing the hold by calling dnode_rele().
  */
-
-void
-zap_put_leaf(zap_leaf_t *l)
+uint64_t
+zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
 {
-	rw_exit(&l->l_rwlock);
-	dmu_buf_rele(l->l_dbuf, NULL);
+	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
+	    indirect_blockshift, bonustype, bonuslen, dnodesize,
+	    allocated_dnode, tag, tx));
 }
 
-static zap_leaf_t *
-zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
-{
-	ASSERT(blkid != 0);
-
-	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
-	rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL);
-	rw_enter(&l->l_rwlock, RW_WRITER);
-	l->l_blkid = blkid;
-	l->l_bs = highbit64(db->db_size) - 1;
-	l->l_dbuf = db;
-
-	dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
-	zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
-
-	rw_exit(&l->l_rwlock);
-	if (winner != NULL) {
-		/* someone else set it first */
-		zap_leaf_evict_sync(&l->l_dbu);
-		l = winner;
-	}
-
-	/*
-	 * lhr_pad was previously used for the next leaf in the leaf
-	 * chain.  There should be no chained leafs (as we have removed
-	 * support for them).
-	 */
-	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
-
-	/*
-	 * There should be more hash entries than there can be
-	 * chunks to put in the hash table
-	 */
-	ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
-
-	/* The chunks should begin at the end of the hash table */
-	ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *)
-	    &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
-
-	/* The chunks should end at the end of the block */
-	ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
-	    (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
-
-	return (l);
-}
-
-static int
-zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
-    zap_leaf_t **lp)
-{
-	dmu_buf_t *db;
-
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
-	/*
-	 * If system crashed just after dmu_free_long_range in zfs_rmnode, we
-	 * would be left with an empty xattr dir in delete queue. blkid=0
-	 * would be passed in when doing zfs_purgedir. If that's the case we
-	 * should just return immediately. The underlying objects should
-	 * already be freed, so this should be perfectly fine.
-	 */
-	if (blkid == 0)
-		return (SET_ERROR(ENOENT));
-
-	int bs = FZAP_BLOCK_SHIFT(zap);
-	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
-	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
-	if (err != 0)
-		return (err);
-
-	ASSERT3U(db->db_object, ==, zap->zap_object);
-	ASSERT3U(db->db_offset, ==, blkid << bs);
-	ASSERT3U(db->db_size, ==, 1 << bs);
-	ASSERT(blkid != 0);
-
-	zap_leaf_t *l = dmu_buf_get_user(db);
-
-	if (l == NULL)
-		l = zap_open_leaf(blkid, db);
-
-	rw_enter(&l->l_rwlock, lt);
-	/*
-	 * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
-	 * causing ASSERT below to fail.
-	 */
-	if (lt == RW_WRITER)
-		dmu_buf_will_dirty(db, tx);
-	ASSERT3U(l->l_blkid, ==, blkid);
-	ASSERT3P(l->l_dbuf, ==, db);
-	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
-	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
-	*lp = l;
-	return (0);
-}
-
-static int
-zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
-{
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
-	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
-		ASSERT3U(idx, <,
-		    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
-		*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
-		return (0);
-	} else {
-		return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
-		    idx, valp));
-	}
-}
-
-static int
-zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
-{
-	ASSERT(tx != NULL);
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
-		ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
-		return (0);
-	} else {
-		return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
-		    idx, blk, tx));
-	}
-}
-
-static int
-zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
-    dmu_tx_t *tx)
-{
-	int bs = FZAP_BLOCK_SHIFT(zap);
-	int epb = bs >> 3; /* entries per block */
-	int err = 0;
-
-	ASSERT(tx != NULL);
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-	/*
-	 * Check for i/o errors
-	 */
-	for (int i = 0; i < nptrs; i += epb) {
-		uint64_t blk;
-		err = zap_idx_to_blk(zap, idx + i, &blk);
-		if (err != 0) {
-			return (err);
-		}
-	}
-
-	for (int i = 0; i < nptrs; i++) {
-		err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
-		ASSERT0(err); /* we checked for i/o errors above */
-		if (err != 0)
-			break;
-	}
-
-	return (err);
-}
-
-#define	ZAP_PREFIX_HASH(pref, pref_len)	((pref) << (64 - (pref_len)))
-
-/*
- * Each leaf has single range of entries (block pointers) in the ZAP ptrtbl.
- * If two leaves are siblings, their ranges are adjecent and contain the same
- * number of entries. In order to find out if a leaf has a sibling, we need to
- * check the range corresponding to the sibling leaf. There is no need to check
- * all entries in the range, we only need to check the frist and the last one.
- */
-static uint64_t
-check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
-{
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
-	uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
-	uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
-	uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len;
-	uint64_t nptrs = (1 << pref_diff);
-	uint64_t first;
-	uint64_t last;
-
-	ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
-
-	if (zap_idx_to_blk(zap, idx, &first) != 0)
-		return (0);
-
-	if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
-		return (0);
-
-	if (first != last)
-		return (0);
-	return (first);
-}
-
-static int
-zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
-{
-	uint64_t blk;
-
-	ASSERT(zap->zap_dbuf == NULL ||
-	    zap_f_phys(zap) == zap->zap_dbuf->db_data);
-
-	/* Reality check for corrupt zap objects (leaf or header). */
-	if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
-	    zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
-	    zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
-		return (SET_ERROR(EIO));
-	}
-
-	uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
-	int err = zap_idx_to_blk(zap, idx, &blk);
-	if (err != 0)
-		return (err);
-	err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
-
-	ASSERT(err ||
-	    ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
-	    zap_leaf_phys(*lp)->l_hdr.lh_prefix);
-	return (err);
-}
-
-static int
-zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
-    const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
-{
-	zap_t *zap = zn->zn_zap;
-	uint64_t hash = zn->zn_hash;
-	int err;
-	int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
-
-	ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
-	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-	    zap_leaf_phys(l)->l_hdr.lh_prefix);
-
-	if (zap_tryupgradedir(zap, tx) == 0 ||
-	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
-		/* We failed to upgrade, or need to grow the pointer table */
-		objset_t *os = zap->zap_objset;
-		uint64_t object = zap->zap_object;
-
-		zap_put_leaf(l);
-		*lp = l = NULL;
-		zap_unlockdir(zap, tag);
-		err = zap_lockdir(os, object, tx, RW_WRITER,
-		    FALSE, FALSE, tag, &zn->zn_zap);
-		zap = zn->zn_zap;
-		if (err != 0)
-			return (err);
-		ASSERT(!zap->zap_ismicro);
-
-		while (old_prefix_len ==
-		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
-			err = zap_grow_ptrtbl(zap, tx);
-			if (err != 0)
-				return (err);
-		}
-
-		err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
-		if (err != 0)
-			return (err);
-
-		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
-			/* it split while our locks were down */
-			*lp = l;
-			return (0);
-		}
-	}
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
-	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-	    zap_leaf_phys(l)->l_hdr.lh_prefix);
-
-	int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
-	    (old_prefix_len + 1);
-	uint64_t sibling =
-	    (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
-
-	/* check for i/o errors before doing zap_leaf_split */
-	for (int i = 0; i < (1ULL << prefix_diff); i++) {
-		uint64_t blk;
-		err = zap_idx_to_blk(zap, sibling + i, &blk);
-		if (err != 0)
-			return (err);
-		ASSERT3U(blk, ==, l->l_blkid);
-	}
-
-	zap_leaf_t *nl = zap_create_leaf(zap, tx);
-	zap_leaf_split(l, nl, zap->zap_normflags != 0);
-
-	/* set sibling pointers */
-	for (int i = 0; i < (1ULL << prefix_diff); i++) {
-		err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
-		ASSERT0(err); /* we checked for i/o errors above */
-	}
-
-	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0);
-
-	if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
-		/* we want the sibling */
-		zap_put_leaf(l);
-		*lp = nl;
-	} else {
-		zap_put_leaf(nl);
-		*lp = l;
-	}
-
-	return (0);
-}
-
-static void
-zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
-    const void *tag, dmu_tx_t *tx)
-{
-	zap_t *zap = zn->zn_zap;
-	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
-	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
-	    zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
-
-	zap_put_leaf(l);
-
-	if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
-		/*
-		 * We are in the middle of growing the pointer table, or
-		 * this leaf will soon make us grow it.
-		 */
-		if (zap_tryupgradedir(zap, tx) == 0) {
-			objset_t *os = zap->zap_objset;
-			uint64_t zapobj = zap->zap_object;
-
-			zap_unlockdir(zap, tag);
-			int err = zap_lockdir(os, zapobj, tx,
-			    RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
-			zap = zn->zn_zap;
-			if (err != 0)
-				return;
-		}
-
-		/* could have finished growing while our locks were down */
-		if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
-			(void) zap_grow_ptrtbl(zap, tx);
-	}
-}
-
-static int
-fzap_checkname(zap_name_t *zn)
-{
-	uint32_t maxnamelen = zn->zn_normbuf_len;
-	uint64_t len = (uint64_t)zn->zn_key_orig_numints * zn->zn_key_intlen;
-	/* Only allow directory zap to have longname */
-	if (len > maxnamelen ||
-	    (len > ZAP_MAXNAMELEN &&
-	    zn->zn_zap->zap_dnode->dn_type != DMU_OT_DIRECTORY_CONTENTS))
-		return (SET_ERROR(ENAMETOOLONG));
-	return (0);
-}
-
-static int
-fzap_checksize(uint64_t integer_size, uint64_t num_integers)
-{
-	/* Only integer sizes supported by C */
-	switch (integer_size) {
-	case 1:
-	case 2:
-	case 4:
-	case 8:
-		break;
-	default:
-		return (SET_ERROR(EINVAL));
-	}
-
-	if (integer_size * num_integers > ZAP_MAXVALUELEN)
-		return (SET_ERROR(E2BIG));
-
-	return (0);
-}
-
-static int
-fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
-{
-	int err = fzap_checkname(zn);
-	if (err != 0)
-		return (err);
-	return (fzap_checksize(integer_size, num_integers));
-}
-
-/*
- * Routines for manipulating attributes.
- */
-int
-fzap_lookup(zap_name_t *zn,
-    uint64_t integer_size, uint64_t num_integers, void *buf,
-    char *realname, int rn_len, boolean_t *ncp,
-    uint64_t *actual_num_integers)
-{
-	zap_leaf_t *l;
-	zap_entry_handle_t zeh;
-
-	int err = fzap_checkname(zn);
-	if (err != 0)
-		return (err);
-
-	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
-	if (err != 0)
-		return (err);
-	err = zap_leaf_lookup(l, zn, &zeh);
-	if (err == 0) {
-		if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
-			zap_put_leaf(l);
-			return (err);
-		}
-
-		err = zap_entry_read(&zeh, integer_size, num_integers, buf);
-		if (err == 0 && actual_num_integers != NULL)
-			*actual_num_integers = zeh.zeh_num_integers;
-		(void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
-		if (ncp) {
-			*ncp = zap_entry_normalization_conflict(&zeh,
-			    zn, NULL, zn->zn_zap);
-		}
-	}
-
-	zap_put_leaf(l);
-	return (err);
-}
-
-int
-fzap_add_cd(zap_name_t *zn,
-    uint64_t integer_size, uint64_t num_integers,
-    const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx)
-{
-	zap_leaf_t *l;
-	int err;
-	zap_entry_handle_t zeh;
-	zap_t *zap = zn->zn_zap;
-
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-	ASSERT(!zap->zap_ismicro);
-	ASSERT0(fzap_check(zn, integer_size, num_integers));
-
-	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
-	if (err != 0)
-		return (err);
-retry:
-	err = zap_leaf_lookup(l, zn, &zeh);
-	if (err == 0) {
-		err = SET_ERROR(EEXIST);
-		goto out;
-	}
-	if (err != ENOENT)
-		goto out;
-
-	err = zap_entry_create(l, zn, cd,
-	    integer_size, num_integers, val, &zeh);
-
-	if (err == 0) {
-		zap_increment_num_entries(zap, 1, tx);
-	} else if (err == EAGAIN) {
-		err = zap_expand_leaf(zn, l, tag, tx, &l);
-		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
-		if (err == 0)
-			goto retry;
-	}
-
-out:
-	if (l != NULL) {
-		if (err == ENOSPC)
-			zap_put_leaf(l);
-		else
-			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
-	}
-	return (err);
-}
-
-int
-fzap_add(zap_name_t *zn,
-    uint64_t integer_size, uint64_t num_integers,
-    const void *val, const void *tag, dmu_tx_t *tx)
-{
-	int err = fzap_check(zn, integer_size, num_integers);
-	if (err != 0)
-		return (err);
-
-	return (fzap_add_cd(zn, integer_size, num_integers,
-	    val, ZAP_NEED_CD, tag, tx));
-}
-
-int
-fzap_update(zap_name_t *zn,
-    int integer_size, uint64_t num_integers, const void *val,
-    const void *tag, dmu_tx_t *tx)
-{
-	zap_leaf_t *l;
-	int err;
-	boolean_t create;
-	zap_entry_handle_t zeh;
-	zap_t *zap = zn->zn_zap;
-
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-	err = fzap_check(zn, integer_size, num_integers);
-	if (err != 0)
-		return (err);
-
-	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
-	if (err != 0)
-		return (err);
-retry:
-	err = zap_leaf_lookup(l, zn, &zeh);
-	create = (err == ENOENT);
-	ASSERT(err == 0 || err == ENOENT);
-
-	if (create) {
-		err = zap_entry_create(l, zn, ZAP_NEED_CD,
-		    integer_size, num_integers, val, &zeh);
-		if (err == 0)
-			zap_increment_num_entries(zap, 1, tx);
-	} else {
-		err = zap_entry_update(&zeh, integer_size, num_integers, val);
-	}
-
-	if (err == EAGAIN) {
-		err = zap_expand_leaf(zn, l, tag, tx, &l);
-		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
-		if (err == 0)
-			goto retry;
-	}
-
-	if (l != NULL) {
-		if (err == ENOSPC)
-			zap_put_leaf(l);
-		else
-			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
-	}
-	return (err);
-}
-
-int
-fzap_length(zap_name_t *zn,
-    uint64_t *integer_size, uint64_t *num_integers)
-{
-	zap_leaf_t *l;
-	int err;
-	zap_entry_handle_t zeh;
-
-	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
-	if (err != 0)
-		return (err);
-	err = zap_leaf_lookup(l, zn, &zeh);
-	if (err != 0)
-		goto out;
-
-	if (integer_size != NULL)
-		*integer_size = zeh.zeh_integer_size;
-	if (num_integers != NULL)
-		*num_integers = zeh.zeh_num_integers;
-out:
-	zap_put_leaf(l);
-	return (err);
-}
-
-int
-fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
-{
-	zap_leaf_t *l;
-	int err;
-	zap_entry_handle_t zeh;
-
-	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
-	if (err != 0)
-		return (err);
-	err = zap_leaf_lookup(l, zn, &zeh);
-	if (err == 0) {
-		zap_entry_remove(&zeh);
-		zap_increment_num_entries(zn->zn_zap, -1, tx);
-
-		if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
-		    zap_shrink_enabled)
-			return (zap_shrink(zn, l, tx));
-	}
-	zap_put_leaf(l);
-	return (err);
-}
-
-void
-fzap_prefetch(zap_name_t *zn)
-{
-	uint64_t blk;
-	zap_t *zap = zn->zn_zap;
-
-	uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
-	    zap_f_phys(zap)->zap_ptrtbl.zt_shift);
-	if (zap_idx_to_blk(zap, idx, &blk) != 0)
-		return;
-	int bs = FZAP_BLOCK_SHIFT(zap);
-	dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs,
-	    ZIO_PRIORITY_SYNC_READ);
-}
-
-/*
- * Helper functions for consumers.
- */
-
 uint64_t
 zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
     const char *name, dmu_tx_t *tx)
@@ -1109,6 +150,777 @@ zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
 	return (new_obj);
 }
 
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
+	    0, tx));
+}
+
+int
+zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+	return (zap_create_claim_norm_dnsize(os, obj,
+	    0, ot, bonustype, bonuslen, dnodesize, tx));
+}
+
+int
+zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
+    dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
+	    bonuslen, 0, tx));
+}
+
+int
+zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
+    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+    int dnodesize, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int error;
+
+	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
+	    dnodesize, tx);
+	if (error != 0)
+		return (error);
+
+	error = dnode_hold(os, obj, FTAG, &dn);
+	if (error != 0)
+		return (error);
+
+	mzap_create_impl(dn, normflags, 0, tx);
+
+	dnode_rele(dn, FTAG);
+
+	return (0);
+}
+
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+	/*
+	 * dmu_object_free will free the object number and free the
+	 * data.  Freeing the data will cause our pageout function to be
+	 * called, which will destroy our data (zap_leaf_t's and zap_t).
+	 */
+
+	return (dmu_object_free(os, zapobj, tx));
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+static int
+zap_lookup_impl(zap_t *zap, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	int err = 0;
+
+	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
+	if (zn == NULL)
+		return (SET_ERROR(ENOTSUP));
+
+	if (!zap->zap_ismicro) {
+		err = fzap_lookup(zn, integer_size, num_integers, buf,
+		    realname, rn_len, ncp, NULL);
+	} else {
+		zfs_btree_index_t idx;
+		mzap_ent_t *mze = mze_find(zn, &idx);
+		if (mze == NULL) {
+			err = SET_ERROR(ENOENT);
+		} else {
+			if (num_integers < 1) {
+				err = SET_ERROR(EOVERFLOW);
+			} else if (integer_size != 8) {
+				err = SET_ERROR(EINVAL);
+			} else {
+				*(uint64_t *)buf =
+				    MZE_PHYS(zap, mze)->mze_value;
+				if (realname != NULL)
+					(void) strlcpy(realname,
+					    MZE_PHYS(zap, mze)->mze_name,
+					    rn_len);
+				if (ncp) {
+					*ncp = mzap_normalization_conflict(zap,
+					    zn, mze, &idx);
+				}
+			}
+		}
+	}
+	zap_name_free(zn);
+	return (err);
+}
+
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	return (zap_lookup_norm(os, zapobj, name, integer_size,
+	    num_integers, buf, 0, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_impl(zap, name, integer_size,
+	    num_integers, buf, mt, realname, rn_len, ncp);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+static int
+zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
+    uint64_t *actual_num_integers)
+{
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	int err = fzap_lookup(zn, integer_size, num_integers, buf,
+	    NULL, 0, NULL, actual_num_integers);
+	zap_name_free(zn);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+
+int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, buf, NULL);
+	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, buf, NULL);
+	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
+    uint64_t *actual_num_integers)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, buf, actual_num_integers);
+	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+{
+	int err = zap_lookup_norm(os, zapobj, name, 0,
+	    0, NULL, 0, NULL, 0, NULL);
+	if (err == EOVERFLOW || err == EINVAL)
+		err = 0; /* found, but skipped reading the value */
+	return (err);
+}
+
+int
+zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
+{
+	zap_t *zap;
+	int err;
+	zap_name_t *zn;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err)
+		return (err);
+	zn = zap_name_alloc_str(zap, name, 0);
+	if (zn == NULL) {
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	fzap_prefetch(zn);
+	zap_name_free(zn);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_prefetch_object(objset_t *os, uint64_t zapobj)
+{
+	int error;
+	dmu_object_info_t doi;
+
+	error = dmu_object_info(os, zapobj, &doi);
+	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
+		error = SET_ERROR(EINVAL);
+	if (error == 0)
+		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
+
+	return (error);
+}
+
+
+static int
+zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
+{
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	fzap_prefetch(zn);
+	zap_name_free(zn);
+	zap_unlockdir(zap, FTAG);
+	return (0);
+}
+
+int
+zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_prefetch_uint64_impl(zap, key, key_numints);
+	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_prefetch_uint64_impl(zap, key, key_numints);
+	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+	    num_integers, buf, 0, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+
+	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_impl(zap, name, integer_size,
+	    num_integers, buf, mt, realname, rn_len, ncp);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+static int
+zap_add_impl(zap_t *zap, const char *key,
+    int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx, const void *tag)
+{
+	const uint64_t *intval = val;
+	int err = 0;
+
+	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
+	if (zn == NULL) {
+		zap_unlockdir(zap, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+	if (!zap->zap_ismicro) {
+		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+		zap = zn->zn_zap;	/* fzap_add() may change zap */
+	} else if (integer_size != 8 || num_integers != 1 ||
+	    strlen(key) >= MZAP_NAME_LEN ||
+	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
+		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
+		if (err == 0) {
+			err = fzap_add(zn, integer_size, num_integers, val,
+			    tag, tx);
+		}
+		zap = zn->zn_zap;	/* fzap_add() may change zap */
+	} else {
+		zfs_btree_index_t idx;
+		if (mze_find(zn, &idx) != NULL) {
+			err = SET_ERROR(EEXIST);
+		} else {
+			mzap_addent(zn, *intval);
+		}
+	}
+	ASSERT(zap == zn->zn_zap);
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_add() failed */
+		zap_unlockdir(zap, tag);
+	return (err);
+}
+
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
+    int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+	/* zap_add_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_add_by_dnode(dnode_t *dn, const char *key,
+    int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+	/* zap_add_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+static int
+zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx, const void *tag)
+{
+	int err;
+
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+	zap = zn->zn_zap;	/* fzap_add() may change zap */
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_add() failed */
+		zap_unlockdir(zap, tag);
+	return (err);
+}
+
+int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_add_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, val, tx, FTAG);
+	/* zap_add_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_add_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, val, tx, FTAG);
+	/* zap_add_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	const uint64_t *intval = val;
+
+	int err =
+	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
+	if (zn == NULL) {
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+	if (!zap->zap_ismicro) {
+		err = fzap_update(zn, integer_size, num_integers, val,
+		    FTAG, tx);
+		zap = zn->zn_zap;	/* fzap_update() may change zap */
+	} else if (integer_size != 8 || num_integers != 1 ||
+	    strlen(name) >= MZAP_NAME_LEN) {
+		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+		    (u_longlong_t)zapobj, integer_size,
+		    (u_longlong_t)num_integers, name);
+		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+		if (err == 0) {
+			err = fzap_update(zn, integer_size, num_integers,
+			    val, FTAG, tx);
+		}
+		zap = zn->zn_zap;	/* fzap_update() may change zap */
+	} else {
+		zfs_btree_index_t idx;
+		mzap_ent_t *mze = mze_find(zn, &idx);
+		if (mze != NULL) {
+			MZE_PHYS(zap, mze)->mze_value = *intval;
+		} else {
+			mzap_addent(zn, *intval);
+		}
+	}
+	ASSERT(zap == zn->zn_zap);
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_update() failed */
+		zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+static int
+zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
+    const void *tag)
+{
+	int err;
+
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
+	zap = zn->zn_zap;	/* fzap_update() may change zap */
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_update() failed */
+		zap_unlockdir(zap, tag);
+	return (err);
+}
+
+int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers, const void *val,
+    dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_update_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, val, tx, FTAG);
+	/* zap_update_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_update_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, val, tx, FTAG);
+	/* zap_update_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
+	if (zn == NULL) {
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+	if (!zap->zap_ismicro) {
+		err = fzap_length(zn, integer_size, num_integers);
+	} else {
+		zfs_btree_index_t idx;
+		mzap_ent_t *mze = mze_find(zn, &idx);
+		if (mze == NULL) {
+			err = SET_ERROR(ENOENT);
+		} else {
+			if (integer_size)
+				*integer_size = 8;
+			if (num_integers)
+				*num_integers = 1;
+		}
+	}
+	zap_name_free(zn);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+	err = fzap_length(zn, integer_size, num_integers);
+	zap_name_free(zn);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_t *zap;
+
+	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
+		return (err);
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+	err = fzap_length(zn, integer_size, num_integers);
+	zap_name_free(zn);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+static int
+zap_remove_impl(zap_t *zap, const char *name,
+    matchtype_t mt, dmu_tx_t *tx)
+{
+	int err = 0;
+
+	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
+	if (zn == NULL)
+		return (SET_ERROR(ENOTSUP));
+	if (!zap->zap_ismicro) {
+		err = fzap_remove(zn, tx);
+	} else {
+		zfs_btree_index_t idx;
+		mzap_ent_t *mze = mze_find(zn, &idx);
+		if (mze == NULL) {
+			err = SET_ERROR(ENOENT);
+		} else {
+			zap->zap_m.zap_num_entries--;
+			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
+			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
+		}
+	}
+	zap_name_free(zn);
+	return (err);
+}
+
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+	return (zap_remove_norm(os, zapobj, name, 0, tx));
+}
+
+int
+zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
+    matchtype_t mt, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err)
+		return (err);
+	err = zap_remove_impl(zap, name, mt, tx);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err)
+		return (err);
+	err = zap_remove_impl(zap, name, 0, tx);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+static int
+zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
+    dmu_tx_t *tx, const void *tag)
+{
+	int err;
+
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+	err = fzap_remove(zn, tx);
+	zap_name_free(zn);
+	zap_unlockdir(zap, tag);
+	return (err);
+}
+
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
+	/* zap_remove_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
+    dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
+	/* zap_remove_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	if (!zap->zap_ismicro) {
+		err = fzap_count(zap, count);
+	} else {
+		*count = zap->zap_m.zap_num_entries;
+	}
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_count_by_dnode(dnode_t *dn, uint64_t *count)
+{
+	zap_t *zap;
+
+	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
+		return (err);
+	if (!zap->zap_ismicro) {
+		err = fzap_count(zap, count);
+	} else {
+		*count = zap->zap_m.zap_num_entries;
+	}
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+/*
+ * Helper functions for consumers.
+ */
+
 int
 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
     char *name, uint64_t namelen)
@@ -1213,7 +1025,6 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
 	zap_attribute_free(za);
 	return (err);
 }
-
 int
 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
@@ -1241,6 +1052,16 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
 	return (zap_lookup(os, obj, name, 8, 1, &value));
 }
 
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_increment(os, obj, name, delta, tx));
+}
+
 int
 zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx)
@@ -1290,428 +1111,238 @@ zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
 	return (err);
 }
 
-int
-zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
-    dmu_tx_t *tx)
-{
-	char name[20];
-
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
-	return (zap_increment(os, obj, name, delta, tx));
-}
-
 /*
  * Routines for iterating over the attributes.
  */
 
-int
-fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+static void
+zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized, boolean_t prefetch)
 {
-	int err;
-	zap_entry_handle_t zeh;
-	zap_leaf_t *l;
-
-	/* retrieve the next entry at or after zc_hash/zc_cd */
-	/* if no entry, return ENOENT */
-
-	/*
-	 * If we are reading from the beginning, we're almost certain to
-	 * iterate over the entire ZAP object.  If there are multiple leaf
-	 * blocks (freeblk > 2), prefetch the whole object (up to
-	 * dmu_prefetch_max bytes), so that we read the leaf blocks
-	 * concurrently. (Unless noprefetch was requested via
-	 * zap_cursor_init_noprefetch()).
-	 */
-	if (zc->zc_hash == 0 && zap_iterate_prefetch &&
-	    zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
-		dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0,
-		    zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
-		    ZIO_PRIORITY_ASYNC_READ);
-	}
-
-	if (zc->zc_leaf) {
-		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
-
-		/*
-		 * The leaf was either shrunk or split.
-		 */
-		if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
-		    (ZAP_HASH_IDX(zc->zc_hash,
-		    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
-		    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
-			zap_put_leaf(zc->zc_leaf);
-			zc->zc_leaf = NULL;
-		}
-	}
-
-again:
-	if (zc->zc_leaf == NULL) {
-		err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
-		    &zc->zc_leaf);
-		if (err != 0)
-			return (err);
-	}
-	l = zc->zc_leaf;
-
-	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
-
-	if (err == ENOENT) {
-		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) {
-			zc->zc_hash = -1ULL;
-			zc->zc_cd = 0;
-		} else {
-			uint64_t nocare = (1ULL <<
-			    (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
-
-			zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
-			zc->zc_cd = 0;
-
-			if (zc->zc_hash == 0) {
-				zc->zc_hash = -1ULL;
-			} else {
-				zap_put_leaf(zc->zc_leaf);
-				zc->zc_leaf = NULL;
-				goto again;
-			}
-		}
-	}
-
-	if (err == 0) {
-		zc->zc_hash = zeh.zeh_hash;
-		zc->zc_cd = zeh.zeh_cd;
-		za->za_integer_length = zeh.zeh_integer_size;
-		za->za_num_integers = zeh.zeh_num_integers;
-		if (zeh.zeh_num_integers == 0) {
-			za->za_first_integer = 0;
-		} else {
-			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
-			ASSERT(err == 0 || err == EOVERFLOW);
-		}
-		err = zap_entry_read_name(zap, &zeh,
-		    za->za_name_len, za->za_name);
-		ASSERT0(err);
-
-		za->za_normalization_conflict =
-		    zap_entry_normalization_conflict(&zeh,
-		    NULL, za->za_name, zap);
-	}
-	rw_exit(&zc->zc_leaf->l_rwlock);
-	return (err);
+	zc->zc_objset = os;
+	zc->zc_zap = NULL;
+	zc->zc_leaf = NULL;
+	zc->zc_zapobj = zapobj;
+	zc->zc_serialized = serialized;
+	zc->zc_hash = 0;
+	zc->zc_cd = 0;
+	zc->zc_prefetch = prefetch;
 }
 
-static void
-zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+/*
+ * Initialize a cursor at the beginning of the ZAP object.  The entire
+ * ZAP object will be prefetched.
+ */
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
-	uint64_t lastblk = 0;
+	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
+}
 
-	/*
-	 * NB: if a leaf has more pointers than an entire ptrtbl block
-	 * can hold, then it'll be accounted for more than once, since
-	 * we won't have lastblk.
-	 */
-	for (int i = 0; i < len; i++) {
-		zap_leaf_t *l;
-
-		if (tbl[i] == lastblk)
-			continue;
-		lastblk = tbl[i];
-
-		int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
-		if (err == 0) {
-			zap_leaf_stats(zap, l, zs);
-			zap_put_leaf(l);
-		}
-	}
+/*
+ * Initialize a cursor at the beginning, but request that we not prefetch
+ * the entire ZAP object.
+ */
+void
+zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
 }
 
 void
-fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+zap_cursor_fini(zap_cursor_t *zc)
 {
-	int bs = FZAP_BLOCK_SHIFT(zap);
-	zs->zs_blocksize = 1ULL << bs;
-
-	/*
-	 * Set zap_phys_t fields
-	 */
-	zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
-	zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
-	zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
-	zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
-	zs->zs_magic = zap_f_phys(zap)->zap_magic;
-	zs->zs_salt = zap_f_phys(zap)->zap_salt;
-
-	/*
-	 * Set zap_ptrtbl fields
-	 */
-	zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
-	zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
-	zs->zs_ptrtbl_blks_copied =
-	    zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
-	zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
-	zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
-	zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
-
-	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
-		/* the ptrtbl is entirely in the header block. */
-		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
-		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
-	} else {
-		dmu_prefetch_by_dnode(zap->zap_dnode, 0,
-		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
-		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
-		    ZIO_PRIORITY_SYNC_READ);
-
-		for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
-		    b++) {
-			dmu_buf_t *db;
-			int err;
-
-			err = dmu_buf_hold_by_dnode(zap->zap_dnode,
-			    (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
-			    FTAG, &db, DMU_READ_NO_PREFETCH);
-			if (err == 0) {
-				zap_stats_ptrtbl(zap, db->db_data,
-				    1<<(bs-3), zs);
-				dmu_buf_rele(db, FTAG);
-			}
-		}
+	if (zc->zc_zap) {
+		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+		zap_unlockdir(zc->zc_zap, NULL);
+		zc->zc_zap = NULL;
 	}
+	if (zc->zc_leaf) {
+		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+		zap_put_leaf(zc->zc_leaf);
+		zc->zc_leaf = NULL;
+	}
+	zc->zc_objset = NULL;
 }
 
-/*
- * Find last allocated block and update freeblk.
- */
-static void
-zap_trunc(zap_t *zap)
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 {
-	uint64_t nentries;
-	uint64_t lastblk;
+	int err;
 
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	if (zc->zc_hash == -1ULL)
+		return (SET_ERROR(ENOENT));
 
-	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
-		/* External ptrtbl */
-		nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
-		lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
-		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
+	if (zc->zc_zap == NULL) {
+		int hb;
+		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
+		if (err != 0)
+			return (err);
+
+		/*
+		 * To support zap_cursor_init_serialized, advance, retrieve,
+		 * we must add to the existing zc_cd, which may already
+		 * be 1 due to the zap_cursor_advance.
+		 */
+		ASSERT0(zc->zc_hash);
+		hb = zap_hashbits(zc->zc_zap);
+		zc->zc_hash = zc->zc_serialized << (64 - hb);
+		zc->zc_cd += zc->zc_serialized >> hb;
+		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+			zc->zc_cd = 0;
 	} else {
-		/* Embedded ptrtbl */
-		nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
-		lastblk = 0;
+		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
 	}
+	if (!zc->zc_zap->zap_ismicro) {
+		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
+	} else {
+		zfs_btree_index_t idx;
+		mzap_ent_t mze_tofind;
 
-	for (uint64_t idx = 0; idx < nentries; idx++) {
-		uint64_t blk;
-		if (zap_idx_to_blk(zap, idx, &blk) != 0)
-			return;
-		if (blk > lastblk)
-			lastblk = blk;
+		mze_tofind.mze_hash = zc->zc_hash >> 32;
+		mze_tofind.mze_cd = zc->zc_cd;
+
+		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
+		    &mze_tofind, &idx);
+		if (mze == NULL) {
+			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
+			    &idx, &idx);
+		}
+		if (mze) {
+			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
+			za->za_normalization_conflict =
+			    mzap_normalization_conflict(zc->zc_zap, NULL,
+			    mze, &idx);
+			za->za_integer_length = 8;
+			za->za_num_integers = 1;
+			za->za_first_integer = mzep->mze_value;
+			(void) strlcpy(za->za_name, mzep->mze_name,
+			    za->za_name_len);
+			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
+			zc->zc_cd = mze->mze_cd;
+			err = 0;
+		} else {
+			zc->zc_hash = -1ULL;
+			err = SET_ERROR(ENOENT);
+		}
 	}
-
-	ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);
-
-	zap_f_phys(zap)->zap_freeblk = lastblk + 1;
-}
-
-/*
- * ZAP shrinking algorithm.
- *
- * We shrink ZAP recuresively removing empty leaves. We can remove an empty leaf
- * only if it has a sibling. Sibling leaves have the same prefix length and
- * their prefixes differ only by the least significant (sibling) bit. We require
- * both siblings to be empty. This eliminates a need to rehash the non-empty
- * remaining leaf. When we have removed one of two empty sibling, we set ptrtbl
- * entries of the removed leaf to point out to the remaining leaf. Prefix length
- * of the remaining leaf is decremented. As a result, it has a new prefix and it
- * might have a new sibling. So, we repeat the process.
- *
- * Steps:
- * 1. Check if a sibling leaf (sl) exists and it is empty.
- * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
- * 3. Release the sibling (sl) to derefer it again with WRITER lock.
- * 4. Upgrade zapdir lock to WRITER (once).
- * 5. Derefer released leaves again.
- * 6. If it is needed, recheck whether both leaves are still siblings and empty.
- * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of
- * the remaining leaf (slbit 0).
- * 8. Free disk block of the removed leaf (dmu_free_range).
- * 9. Decrement prefix_len of the remaining leaf.
- * 10. Repeat the steps.
- */
-static int
-zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
-{
-	zap_t *zap = zn->zn_zap;
-	int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
-	uint64_t hash = zn->zn_hash;
-	uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
-	uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
-	boolean_t trunc = B_FALSE;
-	int err = 0;
-
-	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries);
-	ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-	ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);
-
-	boolean_t writer = B_FALSE;
-
-	/*
-	 * To avoid deadlock always deref leaves in the same order -
-	 * sibling 0 first, then sibling 1.
-	 */
-	while (prefix_len) {
-		zap_leaf_t *sl;
-		int64_t prefix_diff = zt_shift - prefix_len;
-		uint64_t sl_prefix = prefix ^ 1;
-		uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
-		int slbit = prefix & 1;
-
-		ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries);
-
-		/*
-		 * Check if there is a sibling by reading ptrtbl ptrs.
-		 */
-		if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
-			break;
-
-		/*
-		 * sibling 1, unlock it - we haven't yet dereferenced sibling 0.
-		 */
-		if (slbit == 1) {
-			zap_put_leaf(l);
-			l = NULL;
-		}
-
-		/*
-		 * Dereference sibling leaf and check if it is empty.
-		 */
-		if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
-		    &sl)) != 0)
-			break;
-
-		ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);
-
-		/*
-		 * Check if we have a sibling and it is empty.
-		 */
-		if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
-		    zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
-			zap_put_leaf(sl);
-			break;
-		}
-
-		zap_put_leaf(sl);
-
-		/*
-		 * If there two empty sibling, we have work to do, so
-		 * we need to lock ZAP ptrtbl as WRITER.
-		 */
-		if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
-			/* We failed to upgrade */
-			if (l != NULL) {
-				zap_put_leaf(l);
-				l = NULL;
-			}
-
-			/*
-			 * Usually, the right way to upgrade from a READER lock
-			 * to a WRITER lock is to call zap_unlockdir() and
-			 * zap_lockdir(), but we do not have a tag. Instead,
-			 * we do it in more sophisticated way.
-			 */
-			rw_exit(&zap->zap_rwlock);
-			rw_enter(&zap->zap_rwlock, RW_WRITER);
-			dmu_buf_will_dirty(zap->zap_dbuf, tx);
-
-			zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
-			writer = B_TRUE;
-		}
-
-		/*
-		 * Here we have WRITER lock for ptrtbl.
-		 * Now, we need a WRITER lock for both siblings leaves.
-		 * Also, we have to recheck if the leaves are still siblings
-		 * and still empty.
-		 */
-		if (l == NULL) {
-			/* sibling 0 */
-			if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
-			    tx, RW_WRITER, &l)) != 0)
-				break;
-
-			/*
-			 * The leaf isn't empty anymore or
-			 * it was shrunk/split while our locks were down.
-			 */
-			if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
-			    zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
-				break;
-		}
-
-		/* sibling 1 */
-		if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
-		    RW_WRITER, &sl)) != 0)
-			break;
-
-		/*
-		 * The leaf isn't empty anymore or
-		 * it was shrunk/split while our locks were down.
-		 */
-		if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
-		    zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
-			zap_put_leaf(sl);
-			break;
-		}
-
-		/* If we have gotten here, we have a leaf to collapse */
-		uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
-		uint64_t nptrs = (1ULL << prefix_diff);
-		uint64_t sl_blkid = sl->l_blkid;
-
-		/*
-		 * Set ptrtbl entries to point out to the slibling 0 blkid
-		 */
-		if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
-		    tx)) != 0) {
-			zap_put_leaf(sl);
-			break;
-		}
-
-		/*
-		 * Free sibling 1 disk block.
-		 */
-		int bs = FZAP_BLOCK_SHIFT(zap);
-		if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
-			trunc = B_TRUE;
-
-		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
-		    sl_blkid << bs, 1 << bs, tx);
-		zap_put_leaf(sl);
-
-		zap_f_phys(zap)->zap_num_leafs--;
-
-		/*
-		 * Update prefix and prefix_len.
-		 */
-		zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
-		zap_leaf_phys(l)->l_hdr.lh_prefix_len--;
-
-		prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
-		prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
-	}
-
-	if (trunc)
-		zap_trunc(zap);
-
-	if (l != NULL)
-		zap_put_leaf(l);
-
+	rw_exit(&zc->zc_zap->zap_rwlock);
 	return (err);
 }
 
-ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
-	"When iterating ZAP object, prefetch it");
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+	if (zc->zc_hash == -1ULL)
+		return;
+	zc->zc_cd++;
+}
 
-ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
-	"Enable ZAP shrinking");
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+	if (zc->zc_hash == -1ULL)
+		return (-1ULL);
+	if (zc->zc_zap == NULL)
+		return (zc->zc_serialized);
+	ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap)));
+	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+	/*
+	 * We want to keep the high 32 bits of the cursor zero if we can, so
+	 * that 32-bit programs can access this.  So usually use a small
+	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
+	 * of the cursor.
+	 *
+	 * [ collision differentiator | zap_hashbits()-bit hash value ]
+	 */
+	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
+}
+
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized)
+{
+	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
+}
+
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+
+	memset(zs, 0, sizeof (zap_stats_t));
+
+	if (zap->zap_ismicro) {
+		zs->zs_blocksize = zap->zap_dbuf->db_size;
+		zs->zs_num_entries = zap->zap_m.zap_num_entries;
+		zs->zs_num_blocks = 1;
+	} else {
+		fzap_get_stats(zap, zs);
+	}
+	zap_unlockdir(zap, FTAG);
+	return (0);
+}
+
+EXPORT_SYMBOL(zap_create);
+EXPORT_SYMBOL(zap_create_dnsize);
+EXPORT_SYMBOL(zap_create_norm);
+EXPORT_SYMBOL(zap_create_norm_dnsize);
+EXPORT_SYMBOL(zap_create_flags);
+EXPORT_SYMBOL(zap_create_flags_dnsize);
+EXPORT_SYMBOL(zap_create_claim);
+EXPORT_SYMBOL(zap_create_claim_norm);
+EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
+EXPORT_SYMBOL(zap_create_hold);
+EXPORT_SYMBOL(zap_destroy);
+EXPORT_SYMBOL(zap_lookup);
+EXPORT_SYMBOL(zap_lookup_by_dnode);
+EXPORT_SYMBOL(zap_lookup_norm);
+EXPORT_SYMBOL(zap_lookup_uint64);
+EXPORT_SYMBOL(zap_lookup_length_uint64_by_dnode);
+EXPORT_SYMBOL(zap_contains);
+EXPORT_SYMBOL(zap_prefetch);
+EXPORT_SYMBOL(zap_prefetch_uint64);
+EXPORT_SYMBOL(zap_prefetch_object);
+EXPORT_SYMBOL(zap_add);
+EXPORT_SYMBOL(zap_add_by_dnode);
+EXPORT_SYMBOL(zap_add_uint64);
+EXPORT_SYMBOL(zap_add_uint64_by_dnode);
+EXPORT_SYMBOL(zap_update);
+EXPORT_SYMBOL(zap_update_uint64);
+EXPORT_SYMBOL(zap_update_uint64_by_dnode);
+EXPORT_SYMBOL(zap_length);
+EXPORT_SYMBOL(zap_length_uint64);
+EXPORT_SYMBOL(zap_length_uint64_by_dnode);
+EXPORT_SYMBOL(zap_remove);
+EXPORT_SYMBOL(zap_remove_by_dnode);
+EXPORT_SYMBOL(zap_remove_norm);
+EXPORT_SYMBOL(zap_remove_uint64);
+EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
+EXPORT_SYMBOL(zap_count);
+EXPORT_SYMBOL(zap_count_by_dnode);
+EXPORT_SYMBOL(zap_value_search);
+EXPORT_SYMBOL(zap_join);
+EXPORT_SYMBOL(zap_join_increment);
+EXPORT_SYMBOL(zap_add_int);
+EXPORT_SYMBOL(zap_remove_int);
+EXPORT_SYMBOL(zap_lookup_int);
+EXPORT_SYMBOL(zap_increment_int);
+EXPORT_SYMBOL(zap_add_int_key);
+EXPORT_SYMBOL(zap_lookup_int_key);
+EXPORT_SYMBOL(zap_increment);
+EXPORT_SYMBOL(zap_cursor_init);
+EXPORT_SYMBOL(zap_cursor_fini);
+EXPORT_SYMBOL(zap_cursor_retrieve);
+EXPORT_SYMBOL(zap_cursor_advance);
+EXPORT_SYMBOL(zap_cursor_serialize);
+EXPORT_SYMBOL(zap_cursor_init_serialized);
+EXPORT_SYMBOL(zap_get_stats);
diff --git a/module/zfs/zap_fat.c b/module/zfs/zap_fat.c
new file mode 100644
index 00000000000..6e2f076cfc3
--- /dev/null
+++ b/module/zfs/zap_fat.c
@@ -0,0 +1,1502 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
+ * Copyright (c) 2023, Klara Inc.
+ */
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power of 2 number of pointers.
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len - bit prefix
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+/*
+ * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
+ * (all leaf blocks) when we start iterating over it.
+ *
+ * For zap_cursor_init(), the callers all intend to iterate through all the
+ * entries.  There are a few cases where an error (typically i/o error) could
+ * cause it to bail out early.
+ *
+ * For zap_cursor_init_serialized(), there are callers that do the iteration
+ * outside of ZFS.  Typically they would iterate over everything, but we
+ * don't have control of that.  E.g. zfs_ioc_snapshot_list_next(),
+ * zcp_snapshots_iter(), and other iterators over things in the MOS - these
+ * are called by /sbin/zfs and channel programs.  The other example is
+ * zfs_readdir() which iterates over directory entries for the getdents()
+ * syscall.  /sbin/ls iterates to the end (unless it receives a signal), but
+ * userland doesn't have to.
+ *
+ * Given that the ZAP entries aren't returned in a specific order, the only
+ * legitimate use cases for partial iteration would be:
+ *
+ * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
+ *    get the first 100 and then wait for the user to hit "next page" (which
+ *    they may never do).
+ *
+ * 2. You want to know if there are more than X entries, without relying on
+ *    the zfs-specific implementation of the directory's st_size (which is
+ *    the number of entries).
+ */
+static int zap_iterate_prefetch = B_TRUE;
+
+/*
+ * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
+ * collapsed into a single block.
+ */
+int zap_shrink_enabled = B_TRUE;
+
+int fzap_default_block_shift = 14; /* 16k blocksize */
+
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
+
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+	uint64_t block_type = *(uint64_t *)vbuf;
+
+	if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
+		zap_leaf_byteswap(vbuf, size);
+	else {
+		/* it's a ptrtbl block */
+		byteswap_uint64_array(vbuf, size);
+	}
+}
+
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
+{
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	zap->zap_ismicro = FALSE;
+
+	zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
+	zap->zap_dbu.dbu_evict_func_async = NULL;
+
+	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0);
+	zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
+
+	zap_phys_t *zp = zap_f_phys(zap);
+	/*
+	 * explicitly zero it since it might be coming from an
+	 * initialized microzap
+	 */
+	memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size);
+	zp->zap_block_type = ZBT_HEADER;
+	zp->zap_magic = ZAP_MAGIC;
+
+	zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
+
+	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
+	zp->zap_num_leafs = 1;
+	zp->zap_num_entries = 0;
+	zp->zap_salt = zap->zap_salt;
+	zp->zap_normflags = zap->zap_normflags;
+	zp->zap_flags = flags;
+
+	/* block 1 will be the first leaf */
+	for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+		ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
+
+	/*
+	 * set up block 1 - the first leaf
+	 */
+	dmu_buf_t *db;
+	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
+	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
+	dmu_buf_will_dirty(db, tx);
+
+	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+	l->l_dbuf = db;
+
+	zap_leaf_init(l, zp->zap_normflags != 0);
+
+	kmem_free(l, sizeof (zap_leaf_t));
+	dmu_buf_rele(db, FTAG);
+}
+
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+	if (RW_WRITE_HELD(&zap->zap_rwlock))
+		return (1);
+	if (rw_tryupgrade(&zap->zap_rwlock)) {
+		dmu_buf_will_dirty(zap->zap_dbuf, tx);
+		return (1);
+	}
+	return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+static int
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+    void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+    dmu_tx_t *tx)
+{
+	uint64_t newblk;
+	int bs = FZAP_BLOCK_SHIFT(zap);
+	int hepb = 1<<(bs-4);
+	/* hepb = half the number of entries in a block */
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+	ASSERT(tbl->zt_numblks > 0);
+
+	if (tbl->zt_nextblk != 0) {
+		newblk = tbl->zt_nextblk;
+	} else {
+		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
+		tbl->zt_nextblk = newblk;
+		ASSERT0(tbl->zt_blks_copied);
+		dmu_prefetch_by_dnode(zap->zap_dnode, 0,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
+
+	/*
+	 * Copy the ptrtbl from the old to new location.
+	 */
+
+	uint64_t b = tbl->zt_blks_copied;
+	dmu_buf_t *db_old;
+	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
+	    (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
+	if (err != 0)
+		return (err);
+
+	/* first half of entries in old[b] go to new[2*b+0] */
+	dmu_buf_t *db_new;
+	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
+	    (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func(db_old->db_data, db_new->db_data, hepb);
+	dmu_buf_rele(db_new, FTAG);
+
+	/* second half of entries in old[b] go to new[2*b+1] */
+	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
+	    (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func((uint64_t *)db_old->db_data + hepb,
+	    db_new->db_data, hepb);
+	dmu_buf_rele(db_new, FTAG);
+
+	dmu_buf_rele(db_old, FTAG);
+
+	tbl->zt_blks_copied++;
+
+	dprintf("copied block %llu of %llu\n",
+	    (u_longlong_t)tbl->zt_blks_copied,
+	    (u_longlong_t)tbl->zt_numblks);
+
+	if (tbl->zt_blks_copied == tbl->zt_numblks) {
+		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
+
+		tbl->zt_blk = newblk;
+		tbl->zt_numblks *= 2;
+		tbl->zt_shift++;
+		tbl->zt_nextblk = 0;
+		tbl->zt_blks_copied = 0;
+
+		dprintf("finished; numblocks now %llu (%uk entries)\n",
+		    (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10));
+	}
+
+	return (0);
+}
+
+static int
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+    dmu_tx_t *tx)
+{
+	int bs = FZAP_BLOCK_SHIFT(zap);
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+
+	dprintf("storing %llx at index %llx\n", (u_longlong_t)val,
+	    (u_longlong_t)idx);
+
+	uint64_t blk = idx >> (bs-3);
+	uint64_t off = idx & ((1<<(bs-3))-1);
+
+	dmu_buf_t *db;
+	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
+	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0)
+		return (err);
+	dmu_buf_will_dirty(db, tx);
+
+	if (tbl->zt_nextblk != 0) {
+		uint64_t idx2 = idx * 2;
+		uint64_t blk2 = idx2 >> (bs-3);
+		uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+		dmu_buf_t *db2;
+
+		err = dmu_buf_hold_by_dnode(zap->zap_dnode,
+		    (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
+		    DMU_READ_NO_PREFETCH);
+		if (err != 0) {
+			dmu_buf_rele(db, FTAG);
+			return (err);
+		}
+		dmu_buf_will_dirty(db2, tx);
+		((uint64_t *)db2->db_data)[off2] = val;
+		((uint64_t *)db2->db_data)[off2+1] = val;
+		dmu_buf_rele(db2, FTAG);
+	}
+
+	((uint64_t *)db->db_data)[off] = val;
+	dmu_buf_rele(db, FTAG);
+
+	return (0);
+}
+
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
+{
+	int bs = FZAP_BLOCK_SHIFT(zap);
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	uint64_t blk = idx >> (bs-3);
+	uint64_t off = idx & ((1<<(bs-3))-1);
+
+	dmu_buf_t *db;
+	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
+	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0)
+		return (err);
+	*valp = ((uint64_t *)db->db_data)[off];
+	dmu_buf_rele(db, FTAG);
+
+	if (tbl->zt_nextblk != 0) {
+		/*
+		 * read the nextblk for the sake of i/o error checking,
+		 * so that zap_table_load() will catch errors for
+		 * zap_table_store.
+		 */
+		blk = (idx*2) >> (bs-3);
+
+		err = dmu_buf_hold_by_dnode(zap->zap_dnode,
+		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
+		    DMU_READ_NO_PREFETCH);
+		if (err == 0)
+			dmu_buf_rele(db, FTAG);
+	}
+	return (err);
+}
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+	for (int i = 0; i < n; i++) {
+		uint64_t lb = src[i];
+		dst[2 * i + 0] = lb;
+		dst[2 * i + 1] = lb;
+	}
+}
+
+static int
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+	/*
+	 * The pointer table should never use more hash bits than we
+	 * have (otherwise we'd be using useless zero bits to index it).
+	 * If we are within 2 bits of running out, stop growing, since
+	 * this is already an aberrant condition.
+	 */
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+		return (SET_ERROR(ENOSPC));
+
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+		/*
+		 * We are outgrowing the "embedded" ptrtbl (the one
+		 * stored in the header block).  Give it its own entire
+		 * block, which will double the size of the ptrtbl.
+		 */
+		ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+		    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+		ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
+
+		uint64_t newblk = zap_allocate_blocks(zap, 1);
+		dmu_buf_t *db_new;
+		int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
+		    newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
+		    DMU_READ_NO_PREFETCH);
+		if (err != 0)
+			return (err);
+		dmu_buf_will_dirty(db_new, tx);
+		zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+		    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+		dmu_buf_rele(db_new, FTAG);
+
+		zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+		zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+		zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
+
+		ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
+		    (FZAP_BLOCK_SHIFT(zap)-3));
+
+		return (0);
+	} else {
+		return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
+		    zap_ptrtbl_transfer, tx));
+	}
+}
+
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+	dmu_buf_will_dirty(zap->zap_dbuf, tx);
+	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+	ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+	zap_f_phys(zap)->zap_num_entries += delta;
+	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+static uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks)
+{
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
+	zap_f_phys(zap)->zap_freeblk += nblocks;
+	return (newblk);
+}
+
+static void
+zap_leaf_evict_sync(void *dbu)
+{
+	zap_leaf_t *l = dbu;
+
+	rw_destroy(&l->l_rwlock);
+	kmem_free(l, sizeof (zap_leaf_t));
+}
+
+static zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	uint64_t blkid = zap_allocate_blocks(zap, 1);
+	dmu_buf_t *db = NULL;
+
+	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
+	    blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db,
+	    DMU_READ_NO_PREFETCH));
+
+	/*
+	 * Create the leaf structure and stash it on the dbuf. If the zap was
+	 * recently shrunk or truncated, the dbuf might have been sitting in
+	 * the cache waiting to be evicted, and so still have the old leaf
+	 * attached to it. If so, just reuse it.
+	 */
+	zap_leaf_t *l = dmu_buf_get_user(db);
+	if (l == NULL) {
+		l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+		l->l_blkid = blkid;
+		l->l_dbuf = db;
+		rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL);
+		dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL,
+		    &l->l_dbuf);
+		dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
+	} else {
+		ASSERT3U(l->l_blkid, ==, blkid);
+		ASSERT3P(l->l_dbuf, ==, db);
+	}
+
+	rw_enter(&l->l_rwlock, RW_WRITER);
+	dmu_buf_will_dirty(l->l_dbuf, tx);
+
+	zap_leaf_init(l, zap->zap_normflags != 0);
+
+	zap_f_phys(zap)->zap_num_leafs++;
+
+	return (l);
+}
+
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+	ASSERT(!zap->zap_ismicro);
+	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+	*count = zap_f_phys(zap)->zap_num_entries;
+	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+	return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+void
+zap_put_leaf(zap_leaf_t *l)
+{
+	rw_exit(&l->l_rwlock);
+	dmu_buf_rele(l->l_dbuf, NULL);
+}
+
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+	ASSERT(blkid != 0);
+
+	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+	rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL);
+	rw_enter(&l->l_rwlock, RW_WRITER);
+	l->l_blkid = blkid;
+	l->l_bs = highbit64(db->db_size) - 1;
+	l->l_dbuf = db;
+
+	dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+	zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
+
+	rw_exit(&l->l_rwlock);
+	if (winner != NULL) {
+		/* someone else set it first */
+		zap_leaf_evict_sync(&l->l_dbu);
+		l = winner;
+	}
+
+	/*
+	 * lhr_pad was previously used for the next leaf in the leaf
+	 * chain.  There should be no chained leafs (as we have removed
+	 * support for them).
+	 */
+	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
+
+	/*
+	 * There should be more hash entries than there can be
+	 * chunks to put in the hash table
+	 */
+	ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
+
+	/* The chunks should begin at the end of the hash table */
+	ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *)
+	    &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+
+	/* The chunks should end at the end of the block */
+	ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
+	    (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
+
+	return (l);
+}
+
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+    zap_leaf_t **lp)
+{
+	dmu_buf_t *db;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	/*
+	 * If system crashed just after dmu_free_long_range in zfs_rmnode, we
+	 * would be left with an empty xattr dir in delete queue. blkid=0
+	 * would be passed in when doing zfs_purgedir. If that's the case we
+	 * should just return immediately. The underlying objects should
+	 * already be freed, so this should be perfectly fine.
+	 */
+	if (blkid == 0)
+		return (SET_ERROR(ENOENT));
+
+	int bs = FZAP_BLOCK_SHIFT(zap);
+	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
+	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0)
+		return (err);
+
+	ASSERT3U(db->db_object, ==, zap->zap_object);
+	ASSERT3U(db->db_offset, ==, blkid << bs);
+	ASSERT3U(db->db_size, ==, 1 << bs);
+	ASSERT(blkid != 0);
+
+	zap_leaf_t *l = dmu_buf_get_user(db);
+
+	if (l == NULL)
+		l = zap_open_leaf(blkid, db);
+
+	rw_enter(&l->l_rwlock, lt);
+	/*
+	 * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
+	 * causing ASSERT below to fail.
+	 */
+	if (lt == RW_WRITER)
+		dmu_buf_will_dirty(db, tx);
+	ASSERT3U(l->l_blkid, ==, blkid);
+	ASSERT3P(l->l_dbuf, ==, db);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+	*lp = l;
+	return (0);
+}
+
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
+{
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+		ASSERT3U(idx, <,
+		    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+		*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+		return (0);
+	} else {
+		return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
+		    idx, valp));
+	}
+}
+
+static int
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+	ASSERT(tx != NULL);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
+		ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+		return (0);
+	} else {
+		return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
+		    idx, blk, tx));
+	}
+}
+
+/* Point the ptrtbl entries [idx, idx + nptrs) at block blk. */
+static int
+zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
+    dmu_tx_t *tx)
+{
+	int bs = FZAP_BLOCK_SHIFT(zap);
+	uint64_t hepb = 1ULL << (bs - 4); /* half the entries in a block */
+	int err = 0;
+
+	ASSERT(tx != NULL);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	/*
+	 * Check for i/o errors up front by probing every half block touched
+	 * by the range. The half-block stride also covers both blocks that
+	 * zap_table_store() writes while the table is being grown.
+	 */
+	for (uint64_t i = 0; i < nptrs; i += hepb - ((idx + i) & (hepb - 1))) {
+		uint64_t cur;
+		err = zap_idx_to_blk(zap, idx + i, &cur);
+		if (err != 0)
+			return (err);
+	}
+
+	for (uint64_t i = 0; i < nptrs && err == 0; i++) {
+		err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
+		ASSERT0(err); /* we checked for i/o errors above */
+	}
+
+	return (err);
+}
+
+#define	ZAP_PREFIX_HASH(pref, pref_len)	((pref) << (64 - (pref_len)))
+#define	ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+/*
+ * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl.
+ * Sibling leaves have adjacent ranges of equal size, so to find out whether a
+ * leaf has a sibling we only need to compare the first and the last entry of
+ * the sibling's range. Returns the block they both point to, or 0 if they
+ * differ or cannot be read.
+ */
+static uint64_t
+check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
+{
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
+	uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+	uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len;
+	uint64_t nptrs = (1ULL << pref_diff); /* avoid a 32-bit shift */
+	uint64_t first;
+	uint64_t last;
+
+	ASSERT3U(idx+nptrs, <=, (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+
+	if (zap_idx_to_blk(zap, idx, &first) != 0)
+		return (0);
+
+	if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
+		return (0);
+
+	if (first != last)
+		return (0);
+	return (first);
+}
+
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
+{
+	uint64_t blk;
+
+	ASSERT(zap->zap_dbuf == NULL ||
+	    zap_f_phys(zap) == zap->zap_dbuf->db_data);
+
+	/* Reality check for corrupt zap objects (leaf or header). */
+	if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
+	    zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
+	    zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
+		return (SET_ERROR(EIO));
+	}
+
+	uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+	int err = zap_idx_to_blk(zap, idx, &blk);
+	if (err != 0)
+		return (err);
+	err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
+
+	ASSERT(err ||
+	    ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+	    zap_leaf_phys(*lp)->l_hdr.lh_prefix);
+	return (err);
+}
+
+static int
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
+    const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
+{
+	zap_t *zap = zn->zn_zap;
+	uint64_t hash = zn->zn_hash;
+	int err;
+	int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+	ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+	    zap_leaf_phys(l)->l_hdr.lh_prefix);
+
+	if (zap_tryupgradedir(zap, tx) == 0 ||
+	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
+		/* We failed to upgrade, or need to grow the pointer table */
+		objset_t *os = zap->zap_objset;
+		uint64_t object = zap->zap_object;
+
+		zap_put_leaf(l);
+		*lp = l = NULL;
+		zap_unlockdir(zap, tag);
+		err = zap_lockdir(os, object, tx, RW_WRITER,
+		    FALSE, FALSE, tag, &zn->zn_zap);
+		zap = zn->zn_zap;
+		if (err != 0)
+			return (err);
+		ASSERT(!zap->zap_ismicro);
+
+		while (old_prefix_len ==
+		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
+			err = zap_grow_ptrtbl(zap, tx);
+			if (err != 0)
+				return (err);
+		}
+
+		err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+		if (err != 0)
+			return (err);
+
+		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
+			/* it split while our locks were down */
+			*lp = l;
+			return (0);
+		}
+	}
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+	    zap_leaf_phys(l)->l_hdr.lh_prefix);
+
+	int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+	    (old_prefix_len + 1);
+	uint64_t sibling =
+	    (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
+
+	/* check for i/o errors before doing zap_leaf_split */
+	for (int i = 0; i < (1ULL << prefix_diff); i++) {
+		uint64_t blk;
+		err = zap_idx_to_blk(zap, sibling + i, &blk);
+		if (err != 0)
+			return (err);
+		ASSERT3U(blk, ==, l->l_blkid);
+	}
+
+	zap_leaf_t *nl = zap_create_leaf(zap, tx);
+	zap_leaf_split(l, nl, zap->zap_normflags != 0);
+
+	/* set sibling pointers */
+	for (int i = 0; i < (1ULL << prefix_diff); i++) {
+		err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
+		ASSERT0(err); /* we checked for i/o errors above */
+	}
+
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0);
+
+	if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
+		/* we want the sibling */
+		zap_put_leaf(l);
+		*lp = nl;
+	} else {
+		zap_put_leaf(nl);
+		*lp = l;
+	}
+
+	return (0);
+}
+
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
+    const void *tag, dmu_tx_t *tx)
+{
+	zap_t *zap = zn->zn_zap;
+	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+	    zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+
+	zap_put_leaf(l);
+
+	if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
+		/*
+		 * We are in the middle of growing the pointer table, or
+		 * this leaf will soon make us grow it.
+		 */
+		if (zap_tryupgradedir(zap, tx) == 0) {
+			objset_t *os = zap->zap_objset;
+			uint64_t zapobj = zap->zap_object;
+
+			zap_unlockdir(zap, tag);
+			int err = zap_lockdir(os, zapobj, tx,
+			    RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
+			zap = zn->zn_zap;
+			if (err != 0)
+				return;
+		}
+
+		/* could have finished growing while our locks were down */
+		if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
+			(void) zap_grow_ptrtbl(zap, tx);
+	}
+}
+
+static int
+fzap_checkname(zap_name_t *zn)
+{
+	uint32_t maxnamelen = zn->zn_normbuf_len;
+	uint64_t len = (uint64_t)zn->zn_key_orig_numints * zn->zn_key_intlen;
+	/* Only allow directory zap to have longname */
+	if (len > maxnamelen ||
+	    (len > ZAP_MAXNAMELEN &&
+	    zn->zn_zap->zap_dnode->dn_type != DMU_OT_DIRECTORY_CONTENTS))
+		return (SET_ERROR(ENAMETOOLONG));
+	return (0);
+}
+
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
+	/* Only integer sizes supported by C */
+	switch (integer_size) {
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		break;
+	default:
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (integer_size * num_integers > ZAP_MAXVALUELEN)
+		return (SET_ERROR(E2BIG));
+
+	return (0);
+}
+
+static int
+fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
+{
+	int err = fzap_checkname(zn);
+	if (err != 0)
+		return (err);
+	return (fzap_checksize(integer_size, num_integers));
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+int
+fzap_lookup(zap_name_t *zn,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    char *realname, int rn_len, boolean_t *ncp,
+    uint64_t *actual_num_integers)
+{
+	zap_leaf_t *l;
+	zap_entry_handle_t zeh;
+
+	int err = fzap_checkname(zn);
+	if (err != 0)
+		return (err);
+
+	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+	if (err != 0)
+		return (err);
+	err = zap_leaf_lookup(l, zn, &zeh);
+	if (err == 0) {
+		if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
+			zap_put_leaf(l);
+			return (err);
+		}
+
+		err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+		if (err == 0 && actual_num_integers != NULL)
+			*actual_num_integers = zeh.zeh_num_integers;
+		(void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
+		if (ncp) {
+			*ncp = zap_entry_normalization_conflict(&zeh,
+			    zn, NULL, zn->zn_zap);
+		}
+	}
+
+	zap_put_leaf(l);
+	return (err);
+}
+
+int
+fzap_add_cd(zap_name_t *zn,
+    uint64_t integer_size, uint64_t num_integers,
+    const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx)
+{
+	zap_leaf_t *l;
+	int err;
+	zap_entry_handle_t zeh;
+	zap_t *zap = zn->zn_zap;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT(!zap->zap_ismicro);
+	ASSERT0(fzap_check(zn, integer_size, num_integers));
+
+	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+	if (err != 0)
+		return (err);
+retry:
+	err = zap_leaf_lookup(l, zn, &zeh);
+	if (err == 0) {
+		err = SET_ERROR(EEXIST);
+		goto out;
+	}
+	if (err != ENOENT)
+		goto out;
+
+	err = zap_entry_create(l, zn, cd,
+	    integer_size, num_integers, val, &zeh);
+
+	if (err == 0) {
+		zap_increment_num_entries(zap, 1, tx);
+	} else if (err == EAGAIN) {
+		err = zap_expand_leaf(zn, l, tag, tx, &l);
+		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
+		if (err == 0)
+			goto retry;
+	}
+
+out:
+	if (l != NULL) {
+		if (err == ENOSPC)
+			zap_put_leaf(l);
+		else
+			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+	}
+	return (err);
+}
+
+int
+fzap_add(zap_name_t *zn,
+    uint64_t integer_size, uint64_t num_integers,
+    const void *val, const void *tag, dmu_tx_t *tx)
+{
+	int err = fzap_check(zn, integer_size, num_integers);
+	if (err != 0)
+		return (err);
+
+	return (fzap_add_cd(zn, integer_size, num_integers,
+	    val, ZAP_NEED_CD, tag, tx));
+}
+
+int
+fzap_update(zap_name_t *zn,
+    int integer_size, uint64_t num_integers, const void *val,
+    const void *tag, dmu_tx_t *tx)
+{
+	zap_leaf_t *l;
+	int err;
+	boolean_t create;
+	zap_entry_handle_t zeh;
+	zap_t *zap = zn->zn_zap;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	err = fzap_check(zn, integer_size, num_integers);
+	if (err != 0)
+		return (err);
+
+	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+	if (err != 0)
+		return (err);
+retry:
+	err = zap_leaf_lookup(l, zn, &zeh);
+	create = (err == ENOENT);
+	ASSERT(err == 0 || err == ENOENT);
+
+	if (create) {
+		err = zap_entry_create(l, zn, ZAP_NEED_CD,
+		    integer_size, num_integers, val, &zeh);
+		if (err == 0)
+			zap_increment_num_entries(zap, 1, tx);
+	} else {
+		err = zap_entry_update(&zeh, integer_size, num_integers, val);
+	}
+
+	if (err == EAGAIN) {
+		err = zap_expand_leaf(zn, l, tag, tx, &l);
+		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
+		if (err == 0)
+			goto retry;
+	}
+
+	if (l != NULL) {
+		if (err == ENOSPC)
+			zap_put_leaf(l);
+		else
+			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+	}
+	return (err);
+}
+
+int
+fzap_length(zap_name_t *zn,
+    uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_leaf_t *l;
+	int err;
+	zap_entry_handle_t zeh;
+
+	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+	if (err != 0)
+		return (err);
+	err = zap_leaf_lookup(l, zn, &zeh);
+	if (err != 0)
+		goto out;
+
+	if (integer_size != NULL)
+		*integer_size = zeh.zeh_integer_size;
+	if (num_integers != NULL)
+		*num_integers = zeh.zeh_num_integers;
+out:
+	zap_put_leaf(l);
+	return (err);
+}
+
+int
+fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
+{
+	zap_leaf_t *l;
+	int err;
+	zap_entry_handle_t zeh;
+
+	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
+	if (err != 0)
+		return (err);
+	err = zap_leaf_lookup(l, zn, &zeh);
+	if (err == 0) {
+		zap_entry_remove(&zeh);
+		zap_increment_num_entries(zn->zn_zap, -1, tx);
+
+		if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
+		    zap_shrink_enabled)
+			return (zap_shrink(zn, l, tx));
+	}
+	zap_put_leaf(l);
+	return (err);
+}
+
+void
+fzap_prefetch(zap_name_t *zn)
+{
+	uint64_t blk;
+	zap_t *zap = zn->zn_zap;
+
+	uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
+	    zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+	if (zap_idx_to_blk(zap, idx, &blk) != 0)
+		return;
+	int bs = FZAP_BLOCK_SHIFT(zap);
+	dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs,
+	    ZIO_PRIORITY_SYNC_READ);
+}
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+	int err;
+	zap_entry_handle_t zeh;
+	zap_leaf_t *l;
+
+	/* retrieve the next entry at or after zc_hash/zc_cd */
+	/* if no entry, return ENOENT */
+
+	/*
+	 * If we are reading from the beginning, we're almost certain to
+	 * iterate over the entire ZAP object.  If there are multiple leaf
+	 * blocks (freeblk > 2), prefetch the whole object (up to
+	 * dmu_prefetch_max bytes), so that we read the leaf blocks
+	 * concurrently. (Unless noprefetch was requested via
+	 * zap_cursor_init_noprefetch()).
+	 */
+	if (zc->zc_hash == 0 && zap_iterate_prefetch &&
+	    zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
+		dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0,
+		    zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
+		    ZIO_PRIORITY_ASYNC_READ);
+	}
+
+	if (zc->zc_leaf) {
+		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+
+		/*
+		 * The leaf was either shrunk or split.
+		 */
+		if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
+		    (ZAP_HASH_IDX(zc->zc_hash,
+		    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+		    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+			zap_put_leaf(zc->zc_leaf);
+			zc->zc_leaf = NULL;
+		}
+	}
+
+again:
+	if (zc->zc_leaf == NULL) {
+		err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+		    &zc->zc_leaf);
+		if (err != 0)
+			return (err);
+	}
+	l = zc->zc_leaf;
+
+	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+	if (err == ENOENT) {
+		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) {
+			zc->zc_hash = -1ULL;
+			zc->zc_cd = 0;
+		} else {
+			uint64_t nocare = (1ULL <<
+			    (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
+
+			zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+			zc->zc_cd = 0;
+
+			if (zc->zc_hash == 0) {
+				zc->zc_hash = -1ULL;
+			} else {
+				zap_put_leaf(zc->zc_leaf);
+				zc->zc_leaf = NULL;
+				goto again;
+			}
+		}
+	}
+
+	if (err == 0) {
+		zc->zc_hash = zeh.zeh_hash;
+		zc->zc_cd = zeh.zeh_cd;
+		za->za_integer_length = zeh.zeh_integer_size;
+		za->za_num_integers = zeh.zeh_num_integers;
+		if (zeh.zeh_num_integers == 0) {
+			za->za_first_integer = 0;
+		} else {
+			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+			ASSERT(err == 0 || err == EOVERFLOW);
+		}
+		err = zap_entry_read_name(zap, &zeh,
+		    za->za_name_len, za->za_name);
+		ASSERT0(err);
+
+		za->za_normalization_conflict =
+		    zap_entry_normalization_conflict(&zeh,
+		    NULL, za->za_name, zap);
+	}
+	rw_exit(&zc->zc_leaf->l_rwlock);
+	return (err);
+}
+
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+	uint64_t lastblk = 0;
+
+	/*
+	 * NB: if a leaf has more pointers than an entire ptrtbl block
+	 * can hold, then it'll be accounted for more than once, since
+	 * we won't have lastblk.
+	 */
+	for (int i = 0; i < len; i++) {
+		zap_leaf_t *l;
+
+		if (tbl[i] == lastblk)
+			continue;
+		lastblk = tbl[i];
+
+		int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+		if (err == 0) {
+			zap_leaf_stats(zap, l, zs);
+			zap_put_leaf(l);
+		}
+	}
+}
+
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+	int bs = FZAP_BLOCK_SHIFT(zap);
+	zs->zs_blocksize = 1ULL << bs;
+
+	/*
+	 * Set zap_phys_t fields
+	 */
+	zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+	zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+	zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+	zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+	zs->zs_magic = zap_f_phys(zap)->zap_magic;
+	zs->zs_salt = zap_f_phys(zap)->zap_salt;
+
+	/*
+	 * Set zap_ptrtbl fields
+	 */
+	zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
+	zs->zs_ptrtbl_blks_copied =
+	    zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+	zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+	zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+	zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+		/* the ptrtbl is entirely in the header block. */
+		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
+	} else {
+		dmu_prefetch_by_dnode(zap->zap_dnode, 0,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
+
+		for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+		    b++) {
+			dmu_buf_t *db;
+			int err;
+
+			err = dmu_buf_hold_by_dnode(zap->zap_dnode,
+			    (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
+			    FTAG, &db, DMU_READ_NO_PREFETCH);
+			if (err == 0) {
+				zap_stats_ptrtbl(zap, db->db_data,
+				    1<<(bs-3), zs);
+				dmu_buf_rele(db, FTAG);
+			}
+		}
+	}
+}
+
+/*
+ * Find last allocated block and update freeblk.
+ */
+static void
+zap_trunc(zap_t *zap)
+{
+	uint64_t nentries;
+	uint64_t lastblk;
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
+		/* External ptrtbl */
+		nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+		lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
+	} else {
+		/* Embedded ptrtbl */
+		nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+		lastblk = 0;
+	}
+
+	for (uint64_t idx = 0; idx < nentries; idx++) {
+		uint64_t blk;
+		if (zap_idx_to_blk(zap, idx, &blk) != 0)
+			return;
+		if (blk > lastblk)
+			lastblk = blk;
+	}
+
+	ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);
+
+	zap_f_phys(zap)->zap_freeblk = lastblk + 1;
+}
+
+/*
+ * ZAP shrinking algorithm.
+ *
+ * We shrink ZAP recursively removing empty leaves. We can remove an empty leaf
+ * only if it has a sibling. Sibling leaves have the same prefix length and
+ * their prefixes differ only by the least significant (sibling) bit. We require
+ * both siblings to be empty. This eliminates a need to rehash the non-empty
+ * remaining leaf. When we have removed one of two empty siblings, we set ptrtbl
+ * entries of the removed leaf to point out to the remaining leaf. Prefix length
+ * of the remaining leaf is decremented. As a result, it has a new prefix and it
+ * might have a new sibling. So, we repeat the process.
+ *
+ * Steps:
+ * 1. Check if a sibling leaf (sl) exists and it is empty.
+ * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
+ * 3. Release the sibling (sl) to dereference it again with WRITER lock.
+ * 4. Upgrade zapdir lock to WRITER (once).
+ * 5. Dereference released leaves again.
+ * 6. If it is needed, recheck whether both leaves are still siblings and empty.
+ * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of
+ * the remaining leaf (slbit 0).
+ * 8. Free disk block of the removed leaf (dmu_free_range).
+ * 9. Decrement prefix_len of the remaining leaf.
+ * 10. Repeat the steps.
+ */
+static int
+zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+{
+	zap_t *zap = zn->zn_zap;
+	int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	uint64_t hash = zn->zn_hash;
+	uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+	uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+	boolean_t trunc = B_FALSE;
+	int err = 0;
+
+	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries);
+	ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);
+
+	boolean_t writer = B_FALSE;
+
+	/*
+	 * To avoid deadlock always deref leaves in the same order -
+	 * sibling 0 first, then sibling 1.
+	 */
+	while (prefix_len) {
+		zap_leaf_t *sl;
+		int64_t prefix_diff = zt_shift - prefix_len;
+		uint64_t sl_prefix = prefix ^ 1;
+		uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
+		int slbit = prefix & 1;
+
+		ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries);
+
+		/*
+		 * Check if there is a sibling by reading ptrtbl ptrs.
+		 */
+		if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
+			break;
+
+		/*
+		 * sibling 1, unlock it - we haven't yet dereferenced sibling 0.
+		 */
+		if (slbit == 1) {
+			zap_put_leaf(l);
+			l = NULL;
+		}
+
+		/*
+		 * Dereference sibling leaf and check if it is empty.
+		 */
+		if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
+		    &sl)) != 0)
+			break;
+
+		ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);
+
+		/*
+		 * Check if we have a sibling and it is empty.
+		 */
+		if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
+		    zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
+			zap_put_leaf(sl);
+			break;
+		}
+
+		zap_put_leaf(sl);
+
+		/*
+		 * If there are two empty siblings, we have work to do, so
+		 * we need to lock ZAP ptrtbl as WRITER.
+		 */
+		if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
+			/* We failed to upgrade */
+			if (l != NULL) {
+				zap_put_leaf(l);
+				l = NULL;
+			}
+
+			/*
+			 * Usually, the right way to upgrade from a READER lock
+			 * to a WRITER lock is to call zap_unlockdir() and
+			 * zap_lockdir(), but we do not have a tag. Instead,
+			 * we do it in a more sophisticated way.
+			 */
+			rw_exit(&zap->zap_rwlock);
+			rw_enter(&zap->zap_rwlock, RW_WRITER);
+			dmu_buf_will_dirty(zap->zap_dbuf, tx);
+
+			zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+			writer = B_TRUE;
+		}
+
+		/*
+		 * Here we have WRITER lock for ptrtbl.
+		 * Now, we need a WRITER lock for both sibling leaves.
+		 * Also, we have to recheck if the leaves are still siblings
+		 * and still empty.
+		 */
+		if (l == NULL) {
+			/* sibling 0 */
+			if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
+			    tx, RW_WRITER, &l)) != 0)
+				break;
+
+			/*
+			 * The leaf isn't empty anymore or
+			 * it was shrunk/split while our locks were down.
+			 */
+			if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
+			    zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
+				break;
+		}
+
+		/* sibling 1 */
+		if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
+		    RW_WRITER, &sl)) != 0)
+			break;
+
+		/*
+		 * The leaf isn't empty anymore or
+		 * it was shrunk/split while our locks were down.
+		 */
+		if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
+		    zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
+			zap_put_leaf(sl);
+			break;
+		}
+
+		/* If we have gotten here, we have a leaf to collapse */
+		uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
+		uint64_t nptrs = (1ULL << prefix_diff);
+		uint64_t sl_blkid = sl->l_blkid;
+
+		/*
+		 * Set ptrtbl entries to point out to the sibling 0 blkid
+		 */
+		if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
+		    tx)) != 0) {
+			zap_put_leaf(sl);
+			break;
+		}
+
+		/*
+		 * Free sibling 1 disk block.
+		 */
+		int bs = FZAP_BLOCK_SHIFT(zap);
+		if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
+			trunc = B_TRUE;
+
+		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
+		    sl_blkid << bs, 1 << bs, tx);
+		zap_put_leaf(sl);
+
+		zap_f_phys(zap)->zap_num_leafs--;
+
+		/*
+		 * Update prefix and prefix_len.
+		 */
+		zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
+		zap_leaf_phys(l)->l_hdr.lh_prefix_len--;
+
+		prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+		prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+	}
+
+	if (trunc)
+		zap_trunc(zap);
+
+	if (l != NULL)
+		zap_put_leaf(l);
+
+	return (err);
+}
+
+ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
+	"When iterating ZAP object, prefetch it");
+
+ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
+	"Enable ZAP shrinking");
diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c
new file mode 100644
index 00000000000..8788480318f
--- /dev/null
+++ b/module/zfs/zap_impl.c
@@ -0,0 +1,527 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2024, Klara, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+
+/*
+ * This routine "consumes" the caller's hold on the dbuf, which must
+ * have the specified tag.
+ */
+int
+zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
+{
+	ASSERT0(db->db_offset);
+	objset_t *os = dmu_buf_get_objset(db);
+	uint64_t obj = db->db_object;
+
+	*zapp = NULL;
+
+	if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP)
+		return (SET_ERROR(EINVAL));
+
+	zap_t *zap = dmu_buf_get_user(db);
+	if (zap == NULL) {
+		zap = mzap_open(db);
+		if (zap == NULL) {
+			/*
+			 * mzap_open() didn't like what it saw on-disk.
+			 * Check for corruption!
+			 */
+			return (SET_ERROR(EIO));
+		}
+	}
+
+	/*
+	 * We're checking zap_ismicro without the lock held, in order to
+	 * tell what type of lock we want.  Once we have some sort of
+	 * lock, see if it really is the right type.  In practice this
+	 * can only be different if it was upgraded from micro to fat,
+	 * and micro wanted WRITER but fat only needs READER.
+	 */
+	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+	rw_enter(&zap->zap_rwlock, lt);
+	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+		/* it was upgraded, now we only need reader */
+		ASSERT(lt == RW_WRITER);
+		ASSERT(RW_READER ==
+		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+		rw_downgrade(&zap->zap_rwlock);
+		lt = RW_READER;
+	}
+
+	zap->zap_objset = os;
+	zap->zap_dnode = dn;
+
+	if (lt == RW_WRITER)
+		dmu_buf_will_dirty(db, tx);
+
+	ASSERT3P(zap->zap_dbuf, ==, db);
+
+	ASSERT(!zap->zap_ismicro ||
+	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+	if (zap->zap_ismicro && tx && adding &&
+	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
+			dprintf("upgrading obj %llu: num_entries=%u\n",
+			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
+			*zapp = zap;
+			int err = mzap_upgrade(zapp, tag, tx, 0);
+			if (err != 0)
+				rw_exit(&zap->zap_rwlock);
+			return (err);
+		}
+		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
+		zap->zap_m.zap_num_chunks =
+		    db->db_size / MZAP_ENT_LEN - 1;
+
+		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
+			dsl_dataset_t *ds = dmu_objset_ds(os);
+			if (!dsl_dataset_feature_is_active(ds,
+			    SPA_FEATURE_LARGE_MICROZAP)) {
+				/*
+				 * A microzap just grew beyond the old limit
+				 * for the first time, so we have to ensure the
+				 * feature flag is activated.
+				 * zap_get_micro_max_size() won't let us get
+				 * here if the feature is not enabled, so we
+				 * don't need any other checks beforehand.
+				 *
+				 * Since we're in open context, we can't
+				 * activate the feature directly, so we instead
+				 * flag it on the dataset for next sync.
+				 */
+				dsl_dataset_dirty(ds, tx);
+				mutex_enter(&ds->ds_lock);
+				ds->ds_feature_activation
+				    [SPA_FEATURE_LARGE_MICROZAP] =
+				    (void *)B_TRUE;
+				mutex_exit(&ds->ds_lock);
+			}
+		}
+	}
+
+	*zapp = zap;
+	return (0);
+}
+
+int
+zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
+    zap_t **zapp)
+{
+	dmu_buf_t *db;
+	int err;
+
+	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0)
+		return (err);
+	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0)
+		dmu_buf_rele(db, tag);
+	else
+		VERIFY(dnode_add_ref(dn, tag));
+	return (err);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
+    zap_t **zapp)
+{
+	dnode_t *dn;
+	dmu_buf_t *db;
+	int err;
+
+	err = dnode_hold(os, obj, tag, &dn);
+	if (err != 0)
+		return (err);
+	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0) {
+		dnode_rele(dn, tag);
+		return (err);
+	}
+	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0) {
+		dmu_buf_rele(db, tag);
+		dnode_rele(dn, tag);
+	}
+	return (err);
+}
+
+void
+zap_unlockdir(zap_t *zap, const void *tag)
+{
+	rw_exit(&zap->zap_rwlock);
+	dnode_rele(zap->zap_dnode, tag);
+	dmu_buf_rele(zap->zap_dbuf, tag);
+}
+
+static kmem_cache_t *zap_name_cache;
+static kmem_cache_t *zap_attr_cache;
+static kmem_cache_t *zap_name_long_cache;
+static kmem_cache_t *zap_attr_long_cache;
+
+void
+zap_init(void)
+{
+	zap_name_cache = kmem_cache_create("zap_name",
+	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
+	    NULL, NULL, NULL, 0);
+
+	zap_attr_cache = kmem_cache_create("zap_attr_cache",
+	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN,  0, NULL,
+	    NULL, NULL, NULL, NULL, 0);
+
+	zap_name_long_cache = kmem_cache_create("zap_name_long",
+	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
+	    NULL, NULL, NULL, 0);
+
+	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
+	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW,  0, NULL,
+	    NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zap_fini(void)
+{
+	kmem_cache_destroy(zap_name_cache);
+	kmem_cache_destroy(zap_attr_cache);
+	kmem_cache_destroy(zap_name_long_cache);
+	kmem_cache_destroy(zap_attr_long_cache);
+}
+
+zap_name_t *
+zap_name_alloc(zap_t *zap, boolean_t longname)
+{
+	kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
+	zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
+
+	zn->zn_zap = zap;
+	zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
+	return (zn);
+}
+
+void
+zap_name_free(zap_name_t *zn)
+{
+	if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
+		kmem_cache_free(zap_name_cache, zn);
+	} else {
+		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
+		kmem_cache_free(zap_name_long_cache, zn);
+	}
+}
+
+int
+zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
+{
+	zap_t *zap = zn->zn_zap;
+	size_t key_len = strlen(key) + 1;
+
+	/* Make sure zn is allocated for longname if key is long */
+	IMPLY(key_len > ZAP_MAXNAMELEN,
+	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);
+
+	zn->zn_key_intlen = sizeof (*key);
+	zn->zn_key_orig = key;
+	zn->zn_key_orig_numints = key_len;
+	zn->zn_matchtype = mt;
+	zn->zn_normflags = zap->zap_normflags;
+
+	/*
+	 * If we're dealing with a case sensitive lookup on a mixed or
+	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
+	 * will fold case to all caps overriding the lookup request.
+	 */
+	if (mt & MT_MATCH_CASE)
+		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
+
+	if (zap->zap_normflags) {
+		/*
+		 * We *must* use zap_normflags because this normalization is
+		 * what the hash is computed from.
+		 */
+		if (zap_normalize(zap, key, zn->zn_normbuf,
+		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
+			return (SET_ERROR(ENOTSUP));
+		zn->zn_key_norm = zn->zn_normbuf;
+		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+	} else {
+		if (mt != 0)
+			return (SET_ERROR(ENOTSUP));
+		zn->zn_key_norm = zn->zn_key_orig;
+		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
+	}
+
+	zn->zn_hash = zap_hash(zn);
+
+	if (zap->zap_normflags != zn->zn_normflags) {
+		/*
+		 * We *must* use zn_normflags because this normalization is
+		 * what the matching is based on.  (Not the hash!)
+		 */
+		if (zap_normalize(zap, key, zn->zn_normbuf,
+		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
+			return (SET_ERROR(ENOTSUP));
+		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+	}
+
+	return (0);
+}
+
+zap_name_t *
+zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
+{
+	size_t key_len = strlen(key) + 1;
+	zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
+	if (zap_name_init_str(zn, key, mt) != 0) {
+		zap_name_free(zn);
+		return (NULL);
+	}
+	return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
+
+	ASSERT0(zap->zap_normflags);
+	zn->zn_zap = zap;
+	zn->zn_key_intlen = sizeof (*key);
+	zn->zn_key_orig = zn->zn_key_norm = key;
+	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
+	zn->zn_matchtype = 0;
+	zn->zn_normbuf_len = ZAP_MAXNAMELEN;
+
+	zn->zn_hash = zap_hash(zn);
+	return (zn);
+}
+
+uint64_t
+zap_getflags(zap_t *zap)
+{
+	if (zap->zap_ismicro)
+		return (0);
+	return (zap_f_phys(zap)->zap_flags);
+}
+
+int
+zap_hashbits(zap_t *zap)
+{
+	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+		return (48);
+	else
+		return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+		return ((1<<16)-1);
+	else
+		return (-1U);
+}
+
+uint64_t
+zap_hash(zap_name_t *zn)
+{
+	zap_t *zap = zn->zn_zap;
+	uint64_t h = 0;
+
+	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+		h = *(uint64_t *)zn->zn_key_orig;
+	} else {
+		h = zap->zap_salt;
+		ASSERT(h != 0);
+		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+			const uint64_t *wp = zn->zn_key_norm;
+
+			ASSERT(zn->zn_key_intlen == 8);
+			for (int i = 0; i < zn->zn_key_norm_numints;
+			    wp++, i++) {
+				uint64_t word = *wp;
+
+				for (int j = 0; j < 8; j++) {
+					h = (h >> 8) ^
+					    zfs_crc64_table[(h ^ word) & 0xFF];
+					word >>= NBBY;
+				}
+			}
+		} else {
+			const uint8_t *cp = zn->zn_key_norm;
+
+			/*
+			 * We previously stored the terminating null on
+			 * disk, but didn't hash it, so we need to
+			 * continue to not hash it.  (The
+			 * zn_key_*_numints includes the terminating
+			 * null for non-binary keys.)
+			 */
+			int len = zn->zn_key_norm_numints - 1;
+
+			ASSERT(zn->zn_key_intlen == 1);
+			for (int i = 0; i < len; cp++, i++) {
+				h = (h >> 8) ^
+				    zfs_crc64_table[(h ^ *cp) & 0xFF];
+			}
+		}
+	}
+	/*
+	 * Don't use all 64 bits, since we need some in the cookie for
+	 * the collision differentiator.  We MUST use the high bits,
+	 * since those are the ones that we first pay attention to when
+	 * choosing the bucket.
+	 */
+	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
+
+	return (h);
+}
+
+int
+zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
+    size_t outlen)
+{
+	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
+	size_t inlen = strlen(name) + 1;
+
+	int err = 0;
+	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
+	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
+	    U8_UNICODE_LATEST, &err);
+
+	return (err);
+}
+
+boolean_t
+zap_match(zap_name_t *zn, const char *matchname)
+{
+	boolean_t res = B_FALSE;
+	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
+	if (zn->zn_matchtype & MT_NORMALIZE) {
+		size_t namelen = zn->zn_normbuf_len;
+		char normbuf[ZAP_MAXNAMELEN];
+		char *norm = normbuf;
+
+		/*
+		 * Cannot allocate this on-stack as it exceeds the stack-limit
+		 * of 1024.
+		 */
+		if (namelen > ZAP_MAXNAMELEN)
+			norm = kmem_alloc(namelen, KM_SLEEP);
+
+		if (zap_normalize(zn->zn_zap, matchname, norm,
+		    zn->zn_normflags, namelen) != 0) {
+			res = B_FALSE;
+		} else {
+			res = (strcmp(zn->zn_key_norm, norm) == 0);
+		}
+		if (norm != normbuf)
+			kmem_free(norm, namelen);
+	} else {
+		res = (strcmp(zn->zn_key_orig, matchname) == 0);
+	}
+	return (res);
+}
+
+void
+zap_byteswap(void *buf, size_t size)
+{
+	uint64_t block_type = *(uint64_t *)buf;
+
+	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
+		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
+		mzap_byteswap(buf, size);
+	} else {
+		fzap_byteswap(buf, size);
+	}
+}
+
+void
+zap_evict_sync(void *dbu)
+{
+	zap_t *zap = dbu;
+
+	rw_destroy(&zap->zap_rwlock);
+
+	if (zap->zap_ismicro)
+		mze_destroy(zap);
+	else
+		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+
+	kmem_free(zap, sizeof (zap_t));
+}
+
+static zap_attribute_t *
+zap_attribute_alloc_impl(boolean_t longname)
+{
+	zap_attribute_t *za;
+
+	za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
+	    KM_SLEEP);
+	za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
+	return (za);
+}
+
+zap_attribute_t *
+zap_attribute_alloc(void)
+{
+	return (zap_attribute_alloc_impl(B_FALSE));
+}
+
+zap_attribute_t *
+zap_attribute_long_alloc(void)
+{
+	return (zap_attribute_alloc_impl(B_TRUE));
+}
+
+void
+zap_attribute_free(zap_attribute_t *za)
+{
+	if (za->za_name_len == ZAP_MAXNAMELEN) {
+		kmem_cache_free(zap_attr_cache, za);
+	} else {
+		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
+		kmem_cache_free(zap_attr_long_cache, za);
+	}
+}
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 4e343ebf5d1..b094b113971 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -81,284 +81,7 @@ zap_get_micro_max_size(spa_t *spa)
 	return (SPA_OLD_MAXBLOCKSIZE);
 }
 
-static int mzap_upgrade(zap_t **zapp,
-    const void *tag, dmu_tx_t *tx, zap_flags_t flags);
-
-uint64_t
-zap_getflags(zap_t *zap)
-{
-	if (zap->zap_ismicro)
-		return (0);
-	return (zap_f_phys(zap)->zap_flags);
-}
-
-int
-zap_hashbits(zap_t *zap)
-{
-	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
-		return (48);
-	else
-		return (28);
-}
-
-uint32_t
-zap_maxcd(zap_t *zap)
-{
-	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
-		return ((1<<16)-1);
-	else
-		return (-1U);
-}
-
-static uint64_t
-zap_hash(zap_name_t *zn)
-{
-	zap_t *zap = zn->zn_zap;
-	uint64_t h = 0;
-
-	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
-		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
-		h = *(uint64_t *)zn->zn_key_orig;
-	} else {
-		h = zap->zap_salt;
-		ASSERT(h != 0);
-		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
-		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
-			const uint64_t *wp = zn->zn_key_norm;
-
-			ASSERT(zn->zn_key_intlen == 8);
-			for (int i = 0; i < zn->zn_key_norm_numints;
-			    wp++, i++) {
-				uint64_t word = *wp;
-
-				for (int j = 0; j < 8; j++) {
-					h = (h >> 8) ^
-					    zfs_crc64_table[(h ^ word) & 0xFF];
-					word >>= NBBY;
-				}
-			}
-		} else {
-			const uint8_t *cp = zn->zn_key_norm;
-
-			/*
-			 * We previously stored the terminating null on
-			 * disk, but didn't hash it, so we need to
-			 * continue to not hash it.  (The
-			 * zn_key_*_numints includes the terminating
-			 * null for non-binary keys.)
-			 */
-			int len = zn->zn_key_norm_numints - 1;
-
-			ASSERT(zn->zn_key_intlen == 1);
-			for (int i = 0; i < len; cp++, i++) {
-				h = (h >> 8) ^
-				    zfs_crc64_table[(h ^ *cp) & 0xFF];
-			}
-		}
-	}
-	/*
-	 * Don't use all 64 bits, since we need some in the cookie for
-	 * the collision differentiator.  We MUST use the high bits,
-	 * since those are the ones that we first pay attention to when
-	 * choosing the bucket.
-	 */
-	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
-
-	return (h);
-}
-
-static int
-zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
-    size_t outlen)
-{
-	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
-
-	size_t inlen = strlen(name) + 1;
-
-	int err = 0;
-	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
-	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
-	    U8_UNICODE_LATEST, &err);
-
-	return (err);
-}
-
-boolean_t
-zap_match(zap_name_t *zn, const char *matchname)
-{
-	boolean_t res = B_FALSE;
-	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
-
-	if (zn->zn_matchtype & MT_NORMALIZE) {
-		size_t namelen = zn->zn_normbuf_len;
-		char normbuf[ZAP_MAXNAMELEN];
-		char *norm = normbuf;
-
-		/*
-		 * Cannot allocate this on-stack as it exceed the stack-limit of
-		 * 1024.
-		 */
-		if (namelen > ZAP_MAXNAMELEN)
-			norm = kmem_alloc(namelen, KM_SLEEP);
-
-		if (zap_normalize(zn->zn_zap, matchname, norm,
-		    zn->zn_normflags, namelen) != 0) {
-			res = B_FALSE;
-		} else {
-			res = (strcmp(zn->zn_key_norm, norm) == 0);
-		}
-		if (norm != normbuf)
-			kmem_free(norm, namelen);
-	} else {
-		res = (strcmp(zn->zn_key_orig, matchname) == 0);
-	}
-	return (res);
-}
-
-static kmem_cache_t *zap_name_cache;
-static kmem_cache_t *zap_attr_cache;
-static kmem_cache_t *zap_name_long_cache;
-static kmem_cache_t *zap_attr_long_cache;
-
 void
-zap_init(void)
-{
-	zap_name_cache = kmem_cache_create("zap_name",
-	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
-	    NULL, NULL, NULL, 0);
-
-	zap_attr_cache = kmem_cache_create("zap_attr_cache",
-	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN,  0, NULL,
-	    NULL, NULL, NULL, NULL, 0);
-
-	zap_name_long_cache = kmem_cache_create("zap_name_long",
-	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
-	    NULL, NULL, NULL, 0);
-
-	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
-	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW,  0, NULL,
-	    NULL, NULL, NULL, NULL, 0);
-}
-
-void
-zap_fini(void)
-{
-	kmem_cache_destroy(zap_name_cache);
-	kmem_cache_destroy(zap_attr_cache);
-	kmem_cache_destroy(zap_name_long_cache);
-	kmem_cache_destroy(zap_attr_long_cache);
-}
-
-static zap_name_t *
-zap_name_alloc(zap_t *zap, boolean_t longname)
-{
-	kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
-	zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
-
-	zn->zn_zap = zap;
-	zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
-	return (zn);
-}
-
-void
-zap_name_free(zap_name_t *zn)
-{
-	if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
-		kmem_cache_free(zap_name_cache, zn);
-	} else {
-		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
-		kmem_cache_free(zap_name_long_cache, zn);
-	}
-}
-
-static int
-zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
-{
-	zap_t *zap = zn->zn_zap;
-	size_t key_len = strlen(key) + 1;
-
-	/* Make sure zn is allocated for longname if key is long */
-	IMPLY(key_len > ZAP_MAXNAMELEN,
-	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);
-
-	zn->zn_key_intlen = sizeof (*key);
-	zn->zn_key_orig = key;
-	zn->zn_key_orig_numints = key_len;
-	zn->zn_matchtype = mt;
-	zn->zn_normflags = zap->zap_normflags;
-
-	/*
-	 * If we're dealing with a case sensitive lookup on a mixed or
-	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
-	 * will fold case to all caps overriding the lookup request.
-	 */
-	if (mt & MT_MATCH_CASE)
-		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
-
-	if (zap->zap_normflags) {
-		/*
-		 * We *must* use zap_normflags because this normalization is
-		 * what the hash is computed from.
-		 */
-		if (zap_normalize(zap, key, zn->zn_normbuf,
-		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
-			return (SET_ERROR(ENOTSUP));
-		zn->zn_key_norm = zn->zn_normbuf;
-		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
-	} else {
-		if (mt != 0)
-			return (SET_ERROR(ENOTSUP));
-		zn->zn_key_norm = zn->zn_key_orig;
-		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
-	}
-
-	zn->zn_hash = zap_hash(zn);
-
-	if (zap->zap_normflags != zn->zn_normflags) {
-		/*
-		 * We *must* use zn_normflags because this normalization is
-		 * what the matching is based on.  (Not the hash!)
-		 */
-		if (zap_normalize(zap, key, zn->zn_normbuf,
-		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
-			return (SET_ERROR(ENOTSUP));
-		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
-	}
-
-	return (0);
-}
-
-zap_name_t *
-zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
-{
-	size_t key_len = strlen(key) + 1;
-	zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
-	if (zap_name_init_str(zn, key, mt) != 0) {
-		zap_name_free(zn);
-		return (NULL);
-	}
-	return (zn);
-}
-
-static zap_name_t *
-zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
-{
-	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
-
-	ASSERT0(zap->zap_normflags);
-	zn->zn_zap = zap;
-	zn->zn_key_intlen = sizeof (*key);
-	zn->zn_key_orig = zn->zn_key_norm = key;
-	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
-	zn->zn_matchtype = 0;
-	zn->zn_normbuf_len = ZAP_MAXNAMELEN;
-
-	zn->zn_hash = zap_hash(zn);
-	return (zn);
-}
-
-static void
 mzap_byteswap(mzap_phys_t *buf, size_t size)
 {
 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
@@ -373,19 +96,6 @@ mzap_byteswap(mzap_phys_t *buf, size_t size)
 	}
 }
 
-void
-zap_byteswap(void *buf, size_t size)
-{
-	uint64_t block_type = *(uint64_t *)buf;
-
-	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
-		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
-		mzap_byteswap(buf, size);
-	} else {
-		fzap_byteswap(buf, size);
-	}
-}
-
 __attribute__((always_inline)) inline
 static int
 mze_compare(const void *arg1, const void *arg2)
@@ -417,7 +127,7 @@ mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
 	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
 }
 
-static mzap_ent_t *
+mzap_ent_t *
 mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
 {
 	mzap_ent_t mze_tofind;
@@ -482,7 +192,7 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
  * Check if the current entry keeps the colliding entries under the fatzap leaf
  * size.
  */
-static boolean_t
+boolean_t
 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
 {
 	zap_t *zap = zn->zn_zap;
@@ -508,14 +218,14 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
 	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
 }
 
-static void
+void
 mze_destroy(zap_t *zap)
 {
 	zfs_btree_clear(&zap->zap_m.zap_tree);
 	zfs_btree_destroy(&zap->zap_m.zap_tree);
 }
 
-static zap_t *
+zap_t *
 mzap_open(dmu_buf_t *db)
 {
 	zap_t *winner;
@@ -614,162 +324,7 @@ mzap_open(dmu_buf_t *db)
 	return (winner);
 }
 
-/*
- * This routine "consumes" the caller's hold on the dbuf, which must
- * have the specified tag.
- */
-static int
-zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
-    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
-{
-	ASSERT0(db->db_offset);
-	objset_t *os = dmu_buf_get_objset(db);
-	uint64_t obj = db->db_object;
-
-	*zapp = NULL;
-
-	if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP)
-		return (SET_ERROR(EINVAL));
-
-	zap_t *zap = dmu_buf_get_user(db);
-	if (zap == NULL) {
-		zap = mzap_open(db);
-		if (zap == NULL) {
-			/*
-			 * mzap_open() didn't like what it saw on-disk.
-			 * Check for corruption!
-			 */
-			return (SET_ERROR(EIO));
-		}
-	}
-
-	/*
-	 * We're checking zap_ismicro without the lock held, in order to
-	 * tell what type of lock we want.  Once we have some sort of
-	 * lock, see if it really is the right type.  In practice this
-	 * can only be different if it was upgraded from micro to fat,
-	 * and micro wanted WRITER but fat only needs READER.
-	 */
-	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
-	rw_enter(&zap->zap_rwlock, lt);
-	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
-		/* it was upgraded, now we only need reader */
-		ASSERT(lt == RW_WRITER);
-		ASSERT(RW_READER ==
-		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
-		rw_downgrade(&zap->zap_rwlock);
-		lt = RW_READER;
-	}
-
-	zap->zap_objset = os;
-	zap->zap_dnode = dn;
-
-	if (lt == RW_WRITER)
-		dmu_buf_will_dirty(db, tx);
-
-	ASSERT3P(zap->zap_dbuf, ==, db);
-
-	ASSERT(!zap->zap_ismicro ||
-	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
-	if (zap->zap_ismicro && tx && adding &&
-	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
-		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
-		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
-			dprintf("upgrading obj %llu: num_entries=%u\n",
-			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
-			*zapp = zap;
-			int err = mzap_upgrade(zapp, tag, tx, 0);
-			if (err != 0)
-				rw_exit(&zap->zap_rwlock);
-			return (err);
-		}
-		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
-		zap->zap_m.zap_num_chunks =
-		    db->db_size / MZAP_ENT_LEN - 1;
-
-		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
-			dsl_dataset_t *ds = dmu_objset_ds(os);
-			if (!dsl_dataset_feature_is_active(ds,
-			    SPA_FEATURE_LARGE_MICROZAP)) {
-				/*
-				 * A microzap just grew beyond the old limit
-				 * for the first time, so we have to ensure the
-				 * feature flag is activated.
-				 * zap_get_micro_max_size() won't let us get
-				 * here if the feature is not enabled, so we
-				 * don't need any other checks beforehand.
-				 *
-				 * Since we're in open context, we can't
-				 * activate the feature directly, so we instead
-				 * flag it on the dataset for next sync.
-				 */
-				dsl_dataset_dirty(ds, tx);
-				mutex_enter(&ds->ds_lock);
-				ds->ds_feature_activation
-				    [SPA_FEATURE_LARGE_MICROZAP] =
-				    (void *)B_TRUE;
-				mutex_exit(&ds->ds_lock);
-			}
-		}
-	}
-
-	*zapp = zap;
-	return (0);
-}
-
-static int
-zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
-    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
-    zap_t **zapp)
-{
-	dmu_buf_t *db;
-	int err;
-
-	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
-	if (err != 0)
-		return (err);
-	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
-	if (err != 0)
-		dmu_buf_rele(db, tag);
-	else
-		VERIFY(dnode_add_ref(dn, tag));
-	return (err);
-}
-
 int
-zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
-    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
-    zap_t **zapp)
-{
-	dnode_t *dn;
-	dmu_buf_t *db;
-	int err;
-
-	err = dnode_hold(os, obj, tag, &dn);
-	if (err != 0)
-		return (err);
-	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
-	if (err != 0) {
-		dnode_rele(dn, tag);
-		return (err);
-	}
-	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
-	if (err != 0) {
-		dmu_buf_rele(db, tag);
-		dnode_rele(dn, tag);
-	}
-	return (err);
-}
-
-void
-zap_unlockdir(zap_t *zap, const void *tag)
-{
-	rw_exit(&zap->zap_rwlock);
-	dnode_rele(zap->zap_dnode, tag);
-	dmu_buf_rele(zap->zap_dbuf, tag);
-}
-
-static int
 mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
 {
 	int err = 0;
@@ -861,217 +416,11 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
 	}
 }
 
-static uint64_t
-zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
-    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
-    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
-    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
-{
-	uint64_t obj;
-
-	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
-
-	if (allocated_dnode == NULL) {
-		dnode_t *dn;
-		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
-		    indirect_blockshift, bonustype, bonuslen, dnodesize,
-		    &dn, FTAG, tx);
-		mzap_create_impl(dn, normflags, flags, tx);
-		dnode_rele(dn, FTAG);
-	} else {
-		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
-		    indirect_blockshift, bonustype, bonuslen, dnodesize,
-		    allocated_dnode, tag, tx);
-		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
-	}
-
-	return (obj);
-}
-
-int
-zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
-	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
-	    0, tx));
-}
-
-int
-zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
-	return (zap_create_claim_norm_dnsize(os, obj,
-	    0, ot, bonustype, bonuslen, dnodesize, tx));
-}
-
-int
-zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
-    dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
-	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
-	    bonuslen, 0, tx));
-}
-
-int
-zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
-    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
-    int dnodesize, dmu_tx_t *tx)
-{
-	dnode_t *dn;
-	int error;
-
-	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
-	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
-	    dnodesize, tx);
-	if (error != 0)
-		return (error);
-
-	error = dnode_hold(os, obj, FTAG, &dn);
-	if (error != 0)
-		return (error);
-
-	mzap_create_impl(dn, normflags, 0, tx);
-
-	dnode_rele(dn, FTAG);
-
-	return (0);
-}
-
-uint64_t
-zap_create(objset_t *os, dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
-	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
-}
-
-uint64_t
-zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
-	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
-	    dnodesize, tx));
-}
-
-uint64_t
-zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
-	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
-	    0, tx));
-}
-
-uint64_t
-zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
-	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
-	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
-}
-
-uint64_t
-zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
-    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
-	return (zap_create_flags_dnsize(os, normflags, flags, ot,
-	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
-}
-
-uint64_t
-zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
-    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
-    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
-{
-	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
-	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
-	    tx));
-}
-
-/*
- * Create a zap object and return a pointer to the newly allocated dnode via
- * the allocated_dnode argument.  The returned dnode will be held and the
- * caller is responsible for releasing the hold by calling dnode_rele().
- */
-uint64_t
-zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
-    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
-    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
-    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
-{
-	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
-	    indirect_blockshift, bonustype, bonuslen, dnodesize,
-	    allocated_dnode, tag, tx));
-}
-
-int
-zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
-{
-	/*
-	 * dmu_object_free will free the object number and free the
-	 * data.  Freeing the data will cause our pageout function to be
-	 * called, which will destroy our data (zap_leaf_t's and zap_t).
-	 */
-
-	return (dmu_object_free(os, zapobj, tx));
-}
-
-void
-zap_evict_sync(void *dbu)
-{
-	zap_t *zap = dbu;
-
-	rw_destroy(&zap->zap_rwlock);
-
-	if (zap->zap_ismicro)
-		mze_destroy(zap);
-	else
-		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
-
-	kmem_free(zap, sizeof (zap_t));
-}
-
-int
-zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	if (!zap->zap_ismicro) {
-		err = fzap_count(zap, count);
-	} else {
-		*count = zap->zap_m.zap_num_entries;
-	}
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-int
-zap_count_by_dnode(dnode_t *dn, uint64_t *count)
-{
-	zap_t *zap;
-
-	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
-	    FTAG, &zap);
-	if (err != 0)
-		return (err);
-	if (!zap->zap_ismicro) {
-		err = fzap_count(zap, count);
-	} else {
-		*count = zap->zap_m.zap_num_entries;
-	}
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
 /*
  * zn may be NULL; if not specified, it will be computed if needed.
  * See also the comment above zap_entry_normalization_conflict().
  */
-static boolean_t
+boolean_t
 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
     zfs_btree_index_t *idx)
 {
@@ -1119,340 +468,7 @@ mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
 	return (B_FALSE);
 }
 
-/*
- * Routines for manipulating attributes.
- */
-
-int
-zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	return (zap_lookup_norm(os, zapobj, name, integer_size,
-	    num_integers, buf, 0, NULL, 0, NULL));
-}
-
-static int
-zap_lookup_impl(zap_t *zap, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf,
-    matchtype_t mt, char *realname, int rn_len,
-    boolean_t *ncp)
-{
-	int err = 0;
-
-	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
-	if (zn == NULL)
-		return (SET_ERROR(ENOTSUP));
-
-	if (!zap->zap_ismicro) {
-		err = fzap_lookup(zn, integer_size, num_integers, buf,
-		    realname, rn_len, ncp, NULL);
-	} else {
-		zfs_btree_index_t idx;
-		mzap_ent_t *mze = mze_find(zn, &idx);
-		if (mze == NULL) {
-			err = SET_ERROR(ENOENT);
-		} else {
-			if (num_integers < 1) {
-				err = SET_ERROR(EOVERFLOW);
-			} else if (integer_size != 8) {
-				err = SET_ERROR(EINVAL);
-			} else {
-				*(uint64_t *)buf =
-				    MZE_PHYS(zap, mze)->mze_value;
-				if (realname != NULL)
-					(void) strlcpy(realname,
-					    MZE_PHYS(zap, mze)->mze_name,
-					    rn_len);
-				if (ncp) {
-					*ncp = mzap_normalization_conflict(zap,
-					    zn, mze, &idx);
-				}
-			}
-		}
-	}
-	zap_name_free(zn);
-	return (err);
-}
-
-int
-zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf,
-    matchtype_t mt, char *realname, int rn_len,
-    boolean_t *ncp)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_impl(zap, name, integer_size,
-	    num_integers, buf, mt, realname, rn_len, ncp);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-int
-zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
-{
-	zap_t *zap;
-	int err;
-	zap_name_t *zn;
-
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err)
-		return (err);
-	zn = zap_name_alloc_str(zap, name, 0);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-
-	fzap_prefetch(zn);
-	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-int
-zap_prefetch_object(objset_t *os, uint64_t zapobj)
-{
-	int error;
-	dmu_object_info_t doi;
-
-	error = dmu_object_info(os, zapobj, &doi);
-	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
-		error = SET_ERROR(EINVAL);
-	if (error == 0)
-		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
-
-	return (error);
-}
-
-int
-zap_lookup_by_dnode(dnode_t *dn, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
-	    num_integers, buf, 0, NULL, 0, NULL));
-}
-
-int
-zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf,
-    matchtype_t mt, char *realname, int rn_len,
-    boolean_t *ncp)
-{
-	zap_t *zap;
-
-	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
-	    FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_impl(zap, name, integer_size,
-	    num_integers, buf, mt, realname, rn_len, ncp);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-static int
-zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
-{
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-
-	fzap_prefetch(zn);
-	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
-	return (0);
-}
-
-int
-zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
-    int key_numints)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_prefetch_uint64_impl(zap, key, key_numints);
-	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_prefetch_uint64_impl(zap, key, key_numints);
-	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-static int
-zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
-    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
-    uint64_t *actual_num_integers)
-{
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-
-	int err = fzap_lookup(zn, integer_size, num_integers, buf,
-	    NULL, 0, NULL, actual_num_integers);
-	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-int
-zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
-    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, NULL);
-	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
-    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, NULL);
-	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
-    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
-    uint64_t *actual_num_integers)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, actual_num_integers);
-	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_contains(objset_t *os, uint64_t zapobj, const char *name)
-{
-	int err = zap_lookup_norm(os, zapobj, name, 0,
-	    0, NULL, 0, NULL, 0, NULL);
-	if (err == EOVERFLOW || err == EINVAL)
-		err = 0; /* found, but skipped reading the value */
-	return (err);
-}
-
-int
-zap_length(objset_t *os, uint64_t zapobj, const char *name,
-    uint64_t *integer_size, uint64_t *num_integers)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-	if (!zap->zap_ismicro) {
-		err = fzap_length(zn, integer_size, num_integers);
-	} else {
-		zfs_btree_index_t idx;
-		mzap_ent_t *mze = mze_find(zn, &idx);
-		if (mze == NULL) {
-			err = SET_ERROR(ENOENT);
-		} else {
-			if (integer_size)
-				*integer_size = 8;
-			if (num_integers)
-				*num_integers = 1;
-		}
-	}
-	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-int
-zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
-    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-	err = fzap_length(zn, integer_size, num_integers);
-	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-int
-zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
-    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
-{
-	zap_t *zap;
-
-	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
-	    FTAG, &zap);
-	if (err != 0)
-		return (err);
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-	err = fzap_length(zn, integer_size, num_integers);
-	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-static void
+void
 mzap_addent(zap_name_t *zn, uint64_t value)
 {
 	zap_t *zap = zn->zn_zap;
@@ -1495,612 +511,6 @@ mzap_addent(zap_name_t *zn, uint64_t value)
 	cmn_err(CE_PANIC, "out of entries!");
 }
 
-static int
-zap_add_impl(zap_t *zap, const char *key,
-    int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx, const void *tag)
-{
-	const uint64_t *intval = val;
-	int err = 0;
-
-	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
-	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
-		return (SET_ERROR(ENOTSUP));
-	}
-	if (!zap->zap_ismicro) {
-		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
-		zap = zn->zn_zap;	/* fzap_add() may change zap */
-	} else if (integer_size != 8 || num_integers != 1 ||
-	    strlen(key) >= MZAP_NAME_LEN ||
-	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
-		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
-		if (err == 0) {
-			err = fzap_add(zn, integer_size, num_integers, val,
-			    tag, tx);
-		}
-		zap = zn->zn_zap;	/* fzap_add() may change zap */
-	} else {
-		zfs_btree_index_t idx;
-		if (mze_find(zn, &idx) != NULL) {
-			err = SET_ERROR(EEXIST);
-		} else {
-			mzap_addent(zn, *intval);
-		}
-	}
-	ASSERT(zap == zn->zn_zap);
-	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap, tag);
-	return (err);
-}
-
-int
-zap_add(objset_t *os, uint64_t zapobj, const char *key,
-    int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-	int err;
-
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_add_by_dnode(dnode_t *dn, const char *key,
-    int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-	int err;
-
-	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-static int
-zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
-    int key_numints, int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx, const void *tag)
-{
-	int err;
-
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
-		return (SET_ERROR(ENOTSUP));
-	}
-	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
-	zap = zn->zn_zap;	/* fzap_add() may change zap */
-	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap, tag);
-	return (err);
-}
-
-int
-zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
-    int key_numints, int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_add_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
-    int key_numints, int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_add_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_update(objset_t *os, uint64_t zapobj, const char *name,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-	const uint64_t *intval = val;
-
-	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-	if (!zap->zap_ismicro) {
-		err = fzap_update(zn, integer_size, num_integers, val,
-		    FTAG, tx);
-		zap = zn->zn_zap;	/* fzap_update() may change zap */
-	} else if (integer_size != 8 || num_integers != 1 ||
-	    strlen(name) >= MZAP_NAME_LEN) {
-		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
-		    (u_longlong_t)zapobj, integer_size,
-		    (u_longlong_t)num_integers, name);
-		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
-		if (err == 0) {
-			err = fzap_update(zn, integer_size, num_integers,
-			    val, FTAG, tx);
-		}
-		zap = zn->zn_zap;	/* fzap_update() may change zap */
-	} else {
-		zfs_btree_index_t idx;
-		mzap_ent_t *mze = mze_find(zn, &idx);
-		if (mze != NULL) {
-			MZE_PHYS(zap, mze)->mze_value = *intval;
-		} else {
-			mzap_addent(zn, *intval);
-		}
-	}
-	ASSERT(zap == zn->zn_zap);
-	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-static int
-zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
-    const void *tag)
-{
-	int err;
-
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
-		return (SET_ERROR(ENOTSUP));
-	}
-	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
-	zap = zn->zn_zap;	/* fzap_update() may change zap */
-	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap, tag);
-	return (err);
-}
-
-int
-zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
-    int key_numints, int integer_size, uint64_t num_integers, const void *val,
-    dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_update_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_update_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_update_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_update_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
-{
-	return (zap_remove_norm(os, zapobj, name, 0, tx));
-}
-
-static int
-zap_remove_impl(zap_t *zap, const char *name,
-    matchtype_t mt, dmu_tx_t *tx)
-{
-	int err = 0;
-
-	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
-	if (zn == NULL)
-		return (SET_ERROR(ENOTSUP));
-	if (!zap->zap_ismicro) {
-		err = fzap_remove(zn, tx);
-	} else {
-		zfs_btree_index_t idx;
-		mzap_ent_t *mze = mze_find(zn, &idx);
-		if (mze == NULL) {
-			err = SET_ERROR(ENOENT);
-		} else {
-			zap->zap_m.zap_num_entries--;
-			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
-			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
-		}
-	}
-	zap_name_free(zn);
-	return (err);
-}
-
-int
-zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
-    matchtype_t mt, dmu_tx_t *tx)
-{
-	zap_t *zap;
-	int err;
-
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err)
-		return (err);
-	err = zap_remove_impl(zap, name, mt, tx);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-int
-zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
-{
-	zap_t *zap;
-	int err;
-
-	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err)
-		return (err);
-	err = zap_remove_impl(zap, name, 0, tx);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
-
-static int
-zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
-    dmu_tx_t *tx, const void *tag)
-{
-	int err;
-
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
-		return (SET_ERROR(ENOTSUP));
-	}
-	err = fzap_remove(zn, tx);
-	zap_name_free(zn);
-	zap_unlockdir(zap, tag);
-	return (err);
-}
-
-int
-zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
-    int key_numints, dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
-	/* zap_remove_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-int
-zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
-    dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
-	/* zap_remove_uint64_impl() calls zap_unlockdir() */
-	return (err);
-}
-
-
-static zap_attribute_t *
-zap_attribute_alloc_impl(boolean_t longname)
-{
-	zap_attribute_t *za;
-
-	za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache,
-	    KM_SLEEP);
-	za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
-	return (za);
-}
-
-zap_attribute_t *
-zap_attribute_alloc(void)
-{
-	return (zap_attribute_alloc_impl(B_FALSE));
-}
-
-zap_attribute_t *
-zap_attribute_long_alloc(void)
-{
-	return (zap_attribute_alloc_impl(B_TRUE));
-}
-
-void
-zap_attribute_free(zap_attribute_t *za)
-{
-	if (za->za_name_len == ZAP_MAXNAMELEN) {
-		kmem_cache_free(zap_attr_cache, za);
-	} else {
-		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
-		kmem_cache_free(zap_attr_long_cache, za);
-	}
-}
-
-/*
- * Routines for iterating over the attributes.
- */
-
-static void
-zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
-    uint64_t serialized, boolean_t prefetch)
-{
-	zc->zc_objset = os;
-	zc->zc_zap = NULL;
-	zc->zc_leaf = NULL;
-	zc->zc_zapobj = zapobj;
-	zc->zc_serialized = serialized;
-	zc->zc_hash = 0;
-	zc->zc_cd = 0;
-	zc->zc_prefetch = prefetch;
-}
-void
-zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
-    uint64_t serialized)
-{
-	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
-}
-
-/*
- * Initialize a cursor at the beginning of the ZAP object.  The entire
- * ZAP object will be prefetched.
- */
-void
-zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
-{
-	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
-}
-
-/*
- * Initialize a cursor at the beginning, but request that we not prefetch
- * the entire ZAP object.
- */
-void
-zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
-{
-	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
-}
-
-void
-zap_cursor_fini(zap_cursor_t *zc)
-{
-	if (zc->zc_zap) {
-		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
-		zap_unlockdir(zc->zc_zap, NULL);
-		zc->zc_zap = NULL;
-	}
-	if (zc->zc_leaf) {
-		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
-		zap_put_leaf(zc->zc_leaf);
-		zc->zc_leaf = NULL;
-	}
-	zc->zc_objset = NULL;
-}
-
-uint64_t
-zap_cursor_serialize(zap_cursor_t *zc)
-{
-	if (zc->zc_hash == -1ULL)
-		return (-1ULL);
-	if (zc->zc_zap == NULL)
-		return (zc->zc_serialized);
-	ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap)));
-	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
-
-	/*
-	 * We want to keep the high 32 bits of the cursor zero if we can, so
-	 * that 32-bit programs can access this.  So usually use a small
-	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
-	 * of the cursor.
-	 *
-	 * [ collision differentiator | zap_hashbits()-bit hash value ]
-	 */
-	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
-	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
-}
-
-int
-zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
-{
-	int err;
-
-	if (zc->zc_hash == -1ULL)
-		return (SET_ERROR(ENOENT));
-
-	if (zc->zc_zap == NULL) {
-		int hb;
-		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
-		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
-		if (err != 0)
-			return (err);
-
-		/*
-		 * To support zap_cursor_init_serialized, advance, retrieve,
-		 * we must add to the existing zc_cd, which may already
-		 * be 1 due to the zap_cursor_advance.
-		 */
-		ASSERT0(zc->zc_hash);
-		hb = zap_hashbits(zc->zc_zap);
-		zc->zc_hash = zc->zc_serialized << (64 - hb);
-		zc->zc_cd += zc->zc_serialized >> hb;
-		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
-			zc->zc_cd = 0;
-	} else {
-		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
-	}
-	if (!zc->zc_zap->zap_ismicro) {
-		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
-	} else {
-		zfs_btree_index_t idx;
-		mzap_ent_t mze_tofind;
-
-		mze_tofind.mze_hash = zc->zc_hash >> 32;
-		mze_tofind.mze_cd = zc->zc_cd;
-
-		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
-		    &mze_tofind, &idx);
-		if (mze == NULL) {
-			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
-			    &idx, &idx);
-		}
-		if (mze) {
-			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
-			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
-			za->za_normalization_conflict =
-			    mzap_normalization_conflict(zc->zc_zap, NULL,
-			    mze, &idx);
-			za->za_integer_length = 8;
-			za->za_num_integers = 1;
-			za->za_first_integer = mzep->mze_value;
-			(void) strlcpy(za->za_name, mzep->mze_name,
-			    za->za_name_len);
-			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
-			zc->zc_cd = mze->mze_cd;
-			err = 0;
-		} else {
-			zc->zc_hash = -1ULL;
-			err = SET_ERROR(ENOENT);
-		}
-	}
-	rw_exit(&zc->zc_zap->zap_rwlock);
-	return (err);
-}
-
-void
-zap_cursor_advance(zap_cursor_t *zc)
-{
-	if (zc->zc_hash == -1ULL)
-		return;
-	zc->zc_cd++;
-}
-
-int
-zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-
-	memset(zs, 0, sizeof (zap_stats_t));
-
-	if (zap->zap_ismicro) {
-		zs->zs_blocksize = zap->zap_dbuf->db_size;
-		zs->zs_num_entries = zap->zap_m.zap_num_entries;
-		zs->zs_num_blocks = 1;
-	} else {
-		fzap_get_stats(zap, zs);
-	}
-	zap_unlockdir(zap, FTAG);
-	return (0);
-}
-
-#if defined(_KERNEL)
-EXPORT_SYMBOL(zap_create);
-EXPORT_SYMBOL(zap_create_dnsize);
-EXPORT_SYMBOL(zap_create_norm);
-EXPORT_SYMBOL(zap_create_norm_dnsize);
-EXPORT_SYMBOL(zap_create_flags);
-EXPORT_SYMBOL(zap_create_flags_dnsize);
-EXPORT_SYMBOL(zap_create_claim);
-EXPORT_SYMBOL(zap_create_claim_norm);
-EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
-EXPORT_SYMBOL(zap_create_hold);
-EXPORT_SYMBOL(zap_destroy);
-EXPORT_SYMBOL(zap_lookup);
-EXPORT_SYMBOL(zap_lookup_by_dnode);
-EXPORT_SYMBOL(zap_lookup_norm);
-EXPORT_SYMBOL(zap_lookup_uint64);
-EXPORT_SYMBOL(zap_lookup_length_uint64_by_dnode);
-EXPORT_SYMBOL(zap_contains);
-EXPORT_SYMBOL(zap_prefetch);
-EXPORT_SYMBOL(zap_prefetch_uint64);
-EXPORT_SYMBOL(zap_prefetch_object);
-EXPORT_SYMBOL(zap_add);
-EXPORT_SYMBOL(zap_add_by_dnode);
-EXPORT_SYMBOL(zap_add_uint64);
-EXPORT_SYMBOL(zap_add_uint64_by_dnode);
-EXPORT_SYMBOL(zap_update);
-EXPORT_SYMBOL(zap_update_uint64);
-EXPORT_SYMBOL(zap_update_uint64_by_dnode);
-EXPORT_SYMBOL(zap_length);
-EXPORT_SYMBOL(zap_length_uint64);
-EXPORT_SYMBOL(zap_length_uint64_by_dnode);
-EXPORT_SYMBOL(zap_remove);
-EXPORT_SYMBOL(zap_remove_by_dnode);
-EXPORT_SYMBOL(zap_remove_norm);
-EXPORT_SYMBOL(zap_remove_uint64);
-EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
-EXPORT_SYMBOL(zap_count);
-EXPORT_SYMBOL(zap_count_by_dnode);
-EXPORT_SYMBOL(zap_value_search);
-EXPORT_SYMBOL(zap_join);
-EXPORT_SYMBOL(zap_join_increment);
-EXPORT_SYMBOL(zap_add_int);
-EXPORT_SYMBOL(zap_remove_int);
-EXPORT_SYMBOL(zap_lookup_int);
-EXPORT_SYMBOL(zap_increment_int);
-EXPORT_SYMBOL(zap_add_int_key);
-EXPORT_SYMBOL(zap_lookup_int_key);
-EXPORT_SYMBOL(zap_increment);
-EXPORT_SYMBOL(zap_cursor_init);
-EXPORT_SYMBOL(zap_cursor_fini);
-EXPORT_SYMBOL(zap_cursor_retrieve);
-EXPORT_SYMBOL(zap_cursor_advance);
-EXPORT_SYMBOL(zap_cursor_serialize);
-EXPORT_SYMBOL(zap_cursor_init_serialized);
-EXPORT_SYMBOL(zap_get_stats);
-
 ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
 	"Maximum micro ZAP size before converting to a fat ZAP, "
 	    "in bytes (max 1M)");
-#endif

From bb304d33bb5d7f027487c4635fd7ed35aa52ba65 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sat, 9 May 2026 20:33:26 +1000
Subject: [PATCH 017/129] zap: public interface cleanup

- reorganising functions into groups, collections of the variants of the
  same function.
- matching header order to source order, to make it a little easier to
  find things.
- moving per-function documentation from source to header.
- adding light documentation to functions that had none.

No actual code changes.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18516
---
 include/sys/zap.h | 193 ++++++++++++++++++++++++---------------
 module/zfs/zap.c  | 223 ++++++++++++++++++++++++++--------------------
 2 files changed, 249 insertions(+), 167 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index 66fbc1385d2..69f021034ba 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -24,6 +24,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2026, TrueNAS.
  */
 
 #ifndef	_SYS_ZAP_H
@@ -121,13 +122,13 @@ typedef enum zap_flags {
 /*
  * Create a new zapobj with no attributes and return its object number.
  */
-uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
+uint64_t zap_create(objset_t *os, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot,
+uint64_t zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
-uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
+uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags,
+uint64_t zap_create_norm_dnsize(objset_t *os, int normflags,
     dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
     int dnodesize, dmu_tx_t *tx);
 uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
@@ -137,11 +138,22 @@ uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
     zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
     int dnodesize, dmu_tx_t *tx);
+
+/*
+ * Create a zap object and return a pointer to the newly allocated dnode via
+ * the allocated_dnode argument.  The returned dnode will be held and the
+ * caller is responsible for releasing the hold by calling dnode_rele().
+ */
 uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize,
     dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx);
 
+/*
+ * Create a new zapobj with no attributes, and add an entry to an existing
+ * zapobj with the given name as key and the object number of the new zapobj as
+ * the value. Returns the object number of the new zapobj.
+ */
 uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
     uint64_t parent_obj, const char *name, dmu_tx_t *tx);
 uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
@@ -157,20 +169,21 @@ void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags,
  * Create a new zapobj with no attributes from the given (unallocated)
  * object number.
  */
-int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+int zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
-int zap_create_claim_norm(objset_t *ds, uint64_t obj,
+int zap_create_claim_norm(objset_t *os, uint64_t obj,
     int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
+int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj,
     int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
 
 /*
- * The zapobj passed in must be a valid ZAP object for all of the
- * following routines.
+ * All operations on a zapobj take either the objset/objectid pair
+ * that "names" the object, or an existing dnode_t for the object. The
+ * zapobj passed in must be a valid ZAP object.
  */
 
 /*
@@ -178,7 +191,7 @@ int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
  *
  * Frees the object number using dmu_object_free.
  */
-int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
+int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx);
 
 /*
  * Manipulate attributes.
@@ -207,21 +220,32 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
  * fit will be transferred to 'buf'.  If the entire attribute was not
  * transferred, the call will return EOVERFLOW.
  */
-int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_lookup_by_dnode(dnode_t *dn, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf);
 
 /*
  * If rn_len is nonzero, realname will be set to the name of the found
  * entry (which may be different from the requested name if matchtype is
- * not MT_EXACT).
+ * not zero).
  *
  * If normalization_conflictp is not NULL, it will be set if there is
  * another name with the same case/unicode normalized form.
  */
-int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *normalization_conflictp);
+int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp);
+
+/*
+ * The _uint64 variants take an array of uint64_t as the key. The ZAP must
+ * be created with ZAP_FLAG_UINT64_KEY.
+ */
 int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
 int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
@@ -229,20 +253,30 @@ int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 int zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
     uint64_t *actual_num_integers);
-int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
+
+/*
+ * Lookup the attribute with the given name. Returns ENOENT if it does not
+ * exist, 0 if it does. This is like zap_lookup(), but may be more efficient.
+ */
+int zap_contains(objset_t *os, uint64_t zapobj, const char *name);
+
+/*
+ * Prefetch the blocks within the ZAP where the given key is stored. The
+ * prefetch IO will occur in the background.
+ */
 int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
-int zap_prefetch_object(objset_t *os, uint64_t zapobj);
+
+/* Prefetch by uint64_t[] key. */
 int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints);
 int zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
     int key_numints);
 
-int zap_lookup_by_dnode(dnode_t *dn, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf);
-int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf,
-    matchtype_t mt, char *realname, int rn_len,
-    boolean_t *ncp);
+/*
+ * Prefetch the entire ZAP object. Unlike zap_prefetch(), will block until
+ * the entire object is loaded into the ARC.
+ */
+int zap_prefetch_object(objset_t *os, uint64_t zapobj);
 
 /*
  * Create an attribute with the given name and value.
@@ -250,13 +284,15 @@ int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
  * If an attribute with the given name already exists, the call will
  * fail and return EEXIST.
  */
-int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
+int zap_add(objset_t *os, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
 int zap_add_by_dnode(dnode_t *dn, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
-int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
+
+/* Add by uint64_t[] key. */
+int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
 int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
@@ -271,8 +307,10 @@ int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
  * existing attribute's integer size, in which case the attribute's
  * integer size will be updated to the new value.
  */
-int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_update(objset_t *os, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+
+/* Update by uint64_t[] key. */
 int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
@@ -287,8 +325,10 @@ int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
  * If the requested attribute does not exist, the call will fail and
  * return ENOENT.
  */
-int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_length(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers);
+
+/* Attribute length by uint64_t[] key. */
 int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t *integer_size, uint64_t *num_integers);
 int zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
@@ -300,10 +340,12 @@ int zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
  * If the specified attribute does not exist, the call will fail and
  * return ENOENT.
  */
-int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
-int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
-    matchtype_t mt, dmu_tx_t *tx);
+int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx);
 int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
+int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
+    matchtype_t mt, dmu_tx_t *tx);
+
+/* Remove by uint64_t[] key. */
 int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, dmu_tx_t *tx);
 int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
@@ -313,9 +355,17 @@ int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
  * Returns (in *count) the number of attributes in the specified zap
  * object.
  */
-int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count);
 int zap_count_by_dnode(dnode_t *dn, uint64_t *count);
 
+/*
+ * Look up an existing uint64 value, add the delta value to it, and
+ * update it with the new value. If the new value is 0, removes the key
+ * entirely.
+ */
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx);
+
 /*
  * Returns (in name) the name of the entry whose (value & mask)
  * (za_first_integer) is value, or ENOENT if not found.  The string
@@ -358,22 +408,12 @@ int zap_update_int_key(objset_t *os, uint64_t obj,
 int zap_lookup_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t *valuep);
 
-int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
-    dmu_tx_t *tx);
-
-struct zap;
-struct zap_leaf;
-typedef struct zap_cursor {
-	/* This structure is opaque! */
-	objset_t *zc_objset;
-	struct zap *zc_zap;
-	struct zap_leaf *zc_leaf;
-	uint64_t zc_zapobj;
-	uint64_t zc_serialized;
-	uint64_t zc_hash;
-	uint32_t zc_cd;
-	boolean_t zc_prefetch;
-} zap_cursor_t;
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as a cursor moving down a list of the attributes one by
+ * one.  The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
 
 typedef struct {
 	int za_integer_length;
@@ -389,9 +429,6 @@ typedef struct {
 	char za_name[];
 } zap_attribute_t;
 
-void zap_init(void);
-void zap_fini(void);
-
 /*
  * Alloc and free zap_attribute_t.
  */
@@ -399,21 +436,44 @@ zap_attribute_t *zap_attribute_alloc(void);
 zap_attribute_t *zap_attribute_long_alloc(void);
 void zap_attribute_free(zap_attribute_t *attrp);
 
-/*
- * The interface for listing all the attributes of a zapobj can be
- * thought of as cursor moving down a list of the attributes one by
- * one.  The cookie returned by the zap_cursor_serialize routine is
- * persistent across system calls (and across reboot, even).
- */
+struct zap;
+struct zap_leaf;
+typedef struct zap_cursor {
+	/* This structure is opaque! */
+	objset_t *zc_objset;
+	struct zap *zc_zap;
+	struct zap_leaf *zc_leaf;
+	uint64_t zc_zapobj;
+	uint64_t zc_serialized;
+	uint64_t zc_hash;
+	uint32_t zc_cd;
+	boolean_t zc_prefetch;
+} zap_cursor_t;
 
 /*
- * Initialize a zap cursor, pointing to the "first" attribute of the
- * zapobj.  You must _fini the cursor when you are done with it.
+ * Initialize a zap cursor, pointing to the "first" attribute of the zapobj.
+ * The entire zapobj will be prefetched. You must call zap_cursor_fini() on
+ * the cursor when you are done with it.
  */
 void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
+void zap_cursor_fini(zap_cursor_t *zc);
+
+/*
+ * Initialize a cursor at the beginning, but request that we not prefetch
+ * the entire ZAP object.
+ */
 void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
     uint64_t zapobj);
-void zap_cursor_fini(zap_cursor_t *zc);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument).  You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie.  zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os,
+    uint64_t zapobj, uint64_t serialized);
 
 /*
  * Get the attribute currently pointed to by the cursor.  Returns
@@ -435,17 +495,6 @@ void zap_cursor_advance(zap_cursor_t *zc);
  */
 uint64_t zap_cursor_serialize(zap_cursor_t *zc);
 
-/*
- * Initialize a zap cursor pointing to the position recorded by
- * zap_cursor_serialize (in the "serialized" argument).  You can also
- * use a "serialized" argument of 0 to start at the beginning of the
- * zapobj (ie.  zap_cursor_init_serialized(..., 0) is equivalent to
- * zap_cursor_init(...).)
- */
-void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
-    uint64_t zapobj, uint64_t serialized);
-
-
 #define	ZAP_HISTOGRAM_SIZE 10
 
 typedef struct zap_stats {
@@ -535,7 +584,11 @@ typedef struct zap_stats {
  * statistics.  This interface shouldn't be relied on unless you really
  * know what you're doing.
  */
-int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs);
+
+/* ZAP subsystem setup/teardown */
+void zap_init(void);
+void zap_fini(void);
 
 #ifdef	__cplusplus
 }
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index fa3f8b836c9..4c4aec07c91 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -26,6 +26,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2026, TrueNAS.
  */
 
 #include <sys/zfs_context.h>
@@ -36,6 +37,8 @@
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 
+/* zap_create */
+
 static uint64_t
 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
@@ -113,11 +116,8 @@ zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
 	    tx));
 }
 
-/*
- * Create a zap object and return a pointer to the newly allocated dnode via
- * the allocated_dnode argument.  The returned dnode will be held and the
- * caller is responsible for releasing the hold by calling dnode_rele().
- */
+/* zap_create_hold */
+
 uint64_t
 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
@@ -129,6 +129,8 @@ zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
 	    allocated_dnode, tag, tx));
 }
 
+/* zap_create_link */
+
 uint64_t
 zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
     const char *name, dmu_tx_t *tx)
@@ -150,6 +152,8 @@ zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
 	return (new_obj);
 }
 
+/* zap_create_claim */
+
 int
 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
@@ -200,6 +204,8 @@ zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
 	return (0);
 }
 
+/* zap_destroy */
+
 int
 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
 {
@@ -212,9 +218,7 @@ zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
 	return (dmu_object_free(os, zapobj, tx));
 }
 
-/*
- * Routines for manipulating attributes.
- */
+/* zap_lookup */
 
 static int
 zap_lookup_impl(zap_t *zap, const char *name,
@@ -267,6 +271,14 @@ zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
 	    num_integers, buf, 0, NULL, 0, NULL));
 }
 
+int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+	    num_integers, buf, 0, NULL, 0, NULL));
+}
+
 int
 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
@@ -285,6 +297,26 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
 	return (err);
 }
 
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+
+	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_impl(zap, name, integer_size,
+	    num_integers, buf, mt, realname, rn_len, ncp);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+/* zap_lookup_uint64 */
+
 static int
 zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
@@ -303,7 +335,6 @@ zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
 	return (err);
 }
 
-
 int
 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
@@ -353,6 +384,8 @@ zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 	return (err);
 }
 
+/* zap_contains */
+
 int
 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 {
@@ -363,6 +396,8 @@ zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 	return (err);
 }
 
+/* zap_prefetch */
+
 int
 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
 {
@@ -385,21 +420,7 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
 	return (err);
 }
 
-int
-zap_prefetch_object(objset_t *os, uint64_t zapobj)
-{
-	int error;
-	dmu_object_info_t doi;
-
-	error = dmu_object_info(os, zapobj, &doi);
-	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
-		error = SET_ERROR(EINVAL);
-	if (error == 0)
-		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
-
-	return (error);
-}
-
+/* zap_prefetch_uint64 */
 
 static int
 zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
@@ -445,32 +466,25 @@ zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
 	return (err);
 }
 
-int
-zap_lookup_by_dnode(dnode_t *dn, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
-	    num_integers, buf, 0, NULL, 0, NULL));
-}
+/* zap_prefetch_object */
 
 int
-zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf,
-    matchtype_t mt, char *realname, int rn_len,
-    boolean_t *ncp)
+zap_prefetch_object(objset_t *os, uint64_t zapobj)
 {
-	zap_t *zap;
+	int error;
+	dmu_object_info_t doi;
 
-	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
-	    FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_impl(zap, name, integer_size,
-	    num_integers, buf, mt, realname, rn_len, ncp);
-	zap_unlockdir(zap, FTAG);
-	return (err);
+	error = dmu_object_info(os, zapobj, &doi);
+	if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
+		error = SET_ERROR(EINVAL);
+	if (error == 0)
+		dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
+
+	return (error);
 }
 
+/* zap_add */
+
 static int
 zap_add_impl(zap_t *zap, const char *key,
     int integer_size, uint64_t num_integers,
@@ -543,6 +557,8 @@ zap_add_by_dnode(dnode_t *dn, const char *key,
 	return (err);
 }
 
+/* zap_add_uint64 */
+
 static int
 zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
@@ -597,6 +613,8 @@ zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 	return (err);
 }
 
+/* zap_update */
+
 int
 zap_update(objset_t *os, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
@@ -644,6 +662,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	return (err);
 }
 
+/* zap_update_uint64 */
+
 static int
 zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
@@ -697,6 +717,8 @@ zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
 	return (err);
 }
 
+/* zap_length */
+
 int
 zap_length(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers)
@@ -731,6 +753,8 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
 	return (err);
 }
 
+/* zap_length_uint64 */
+
 int
 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t *integer_size, uint64_t *num_integers)
@@ -773,6 +797,8 @@ zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 	return (err);
 }
 
+/* zap_remove */
+
 static int
 zap_remove_impl(zap_t *zap, const char *name,
     matchtype_t mt, dmu_tx_t *tx)
@@ -805,6 +831,20 @@ zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
 	return (zap_remove_norm(os, zapobj, name, 0, tx));
 }
 
+int
+zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err)
+		return (err);
+	err = zap_remove_impl(zap, name, 0, tx);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
 int
 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
     matchtype_t mt, dmu_tx_t *tx)
@@ -820,19 +860,7 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
 	return (err);
 }
 
-int
-zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
-{
-	zap_t *zap;
-	int err;
-
-	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err)
-		return (err);
-	err = zap_remove_impl(zap, name, 0, tx);
-	zap_unlockdir(zap, FTAG);
-	return (err);
-}
+/* zap_remove_uint64 */
 
 static int
 zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
@@ -881,6 +909,8 @@ zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
 	return (err);
 }
 
+/* zap_count */
+
 int
 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 {
@@ -917,9 +947,29 @@ zap_count_by_dnode(dnode_t *dn, uint64_t *count)
 	return (err);
 }
 
-/*
- * Helper functions for consumers.
- */
+/* zap_increment */
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx)
+{
+	uint64_t value = 0;
+
+	if (delta == 0)
+		return (0);
+
+	int err = zap_lookup(os, obj, name, 8, 1, &value);
+	if (err != 0 && err != ENOENT)
+		return (err);
+	value += delta;
+	if (value == 0)
+		err = zap_remove(os, obj, name, tx);
+	else
+		err = zap_update(os, obj, name, 8, 1, &value, tx);
+	return (err);
+}
+
+/* zap_value_search */
 
 int
 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
@@ -946,6 +996,8 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
 	return (err);
 }
 
+/* zap_join */
+
 int
 zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
 {
@@ -1025,6 +1077,9 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
 	zap_attribute_free(za);
 	return (err);
 }
+
+/* zap_*_int */
+
 int
 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
@@ -1062,6 +1117,8 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
 	return (zap_increment(os, obj, name, delta, tx));
 }
 
+/* zap_*_int_key */
+
 int
 zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx)
@@ -1091,29 +1148,7 @@ zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 	return (zap_lookup(os, obj, name, 8, 1, valuep));
 }
 
-int
-zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
-    dmu_tx_t *tx)
-{
-	uint64_t value = 0;
-
-	if (delta == 0)
-		return (0);
-
-	int err = zap_lookup(os, obj, name, 8, 1, &value);
-	if (err != 0 && err != ENOENT)
-		return (err);
-	value += delta;
-	if (value == 0)
-		err = zap_remove(os, obj, name, tx);
-	else
-		err = zap_update(os, obj, name, 8, 1, &value, tx);
-	return (err);
-}
-
-/*
- * Routines for iterating over the attributes.
- */
+/* zap_cursor */
 
 static void
 zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
@@ -1129,26 +1164,25 @@ zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
 	zc->zc_prefetch = prefetch;
 }
 
-/*
- * Initialize a cursor at the beginning of the ZAP object.  The entire
- * ZAP object will be prefetched.
- */
 void
 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
 	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
 }
 
-/*
- * Initialize a cursor at the beginning, but request that we not prefetch
- * the entire ZAP object.
- */
 void
 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
 	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
 }
 
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized)
+{
+	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
+}
+
 void
 zap_cursor_fini(zap_cursor_t *zc)
 {
@@ -1262,12 +1296,7 @@ zap_cursor_serialize(zap_cursor_t *zc)
 	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
 }
 
-void
-zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
-    uint64_t serialized)
-{
-	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
-}
+/* zap_get_stats */
 
 int
 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)

From 00a941ea093737950b8e5aa14a1c45ec4848be7d Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sat, 9 May 2026 22:19:23 +1000
Subject: [PATCH 018/129] zap: internal interface cleanup

Similar to previous, though a much lighter touch because these are not
"public" interfaces.

- reorganising functions into groups, by rough function class.
- matching header order to source order, to make it a little easier to
  find things.
- adding light documentation to functions that had none.

Note that I've not added any documentation for the mzap_* and fzap_*
functions, as part of this commit series is laying the groundwork to
hide those functions in their backend modules; such documentation would
become obsolete very quickly.

No actual code changes.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18516
---
 include/sys/zap_impl.h |  77 ++++--
 module/zfs/zap_impl.c  | 519 +++++++++++++++++++++--------------------
 2 files changed, 325 insertions(+), 271 deletions(-)

diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index 78c57e522bc..15cc96df3d1 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -26,6 +26,7 @@
  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2026, TrueNAS.
  */
 
 #ifndef	_SYS_ZAP_IMPL_H
@@ -33,7 +34,6 @@
 
 #include <sys/zap.h>
 #include <sys/zfs_context.h>
-#include <sys/avl.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -170,6 +170,9 @@ typedef struct zap {
 	} zap_u;
 } zap_t;
 
+#define	zap_f	zap_u.zap_fat
+#define	zap_m	zap_u.zap_micro
+
 static inline zap_phys_t *
 zap_f_phys(zap_t *zap)
 {
@@ -182,6 +185,10 @@ zap_m_phys(zap_t *zap)
 	return (zap->zap_dbuf->db_data);
 }
 
+/*
+ * zap_name_t carries the original key and whatever we've derived from it
+ * (normalised form, hash, etc) as we work through completing the operation.
+ */
 typedef struct zap_name {
 	zap_t *zn_zap;
 	int zn_key_intlen;
@@ -196,35 +203,74 @@ typedef struct zap_name {
 	char zn_normbuf[];
 } zap_name_t;
 
-#define	zap_f	zap_u.zap_fat
-#define	zap_m	zap_u.zap_micro
+/*
+ * Allocate a zap_name_t. The longname flag ensures there is enough room to
+ * hold a long filename when the 'longname' pool feature is active.
+ */
+zap_name_t *zap_name_alloc(zap_t *zap, boolean_t longname);
 
+/*
+ * Allocate a zap_name_t for the given key. zap_name_init_str() will be
+ * called to normalise the key and initialise the struct.
+ */
+zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt);
+
+/*
+ * Allocate a zap_name_t for a uint64 array key.
+ */
+zap_name_t *zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints);
+
+/*
+ * Free a zap_name_t.
+ */
+void zap_name_free(zap_name_t *zn);
+
+/*
+ * Initialise an existing zap_name_t with the normalised form of the key,
+ * computed according to the given matchtype.
+ */
+int zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt);
+
+/*
+ * Compare 'matchname' with the name represented by the zap_name_t, applying
+ * the same normalisation method first. Returns true if the normalised forms
+ * match, false otherwise.
+ */
 boolean_t zap_match(zap_name_t *zn, const char *matchname);
+
+/*
+ * Compute and return the 64-bit hash for the name, according to the name
+ * type and hash flags.
+ */
+uint64_t zap_hash(zap_name_t *zn);
+
+/*
+ * Return a zap_t for the given on-disk object, locked and ready for use.
+ * The zap_t will be allocated and loaded from disk if it's not already loaded.
+ */
 int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp);
 int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp);
+
+/* Underlying implementation for above; do not use. */
 int zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
+
+/* Unlock and release a zap_t. */
 void zap_unlockdir(zap_t *zap, const void *tag);
+
+/* zap_t release function for when associated dbuf is evicted. */
 void zap_evict_sync(void *dbu);
-zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key,
-    int numints);
-zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt);
-int zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt);
-zap_name_t * zap_name_alloc(zap_t *zap, boolean_t longname);
-void zap_name_free(zap_name_t *zn);
+
+/* Misc internal state & config. */
 int zap_hashbits(zap_t *zap);
 uint32_t zap_maxcd(zap_t *zap);
 uint64_t zap_getflags(zap_t *zap);
-int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
-    size_t outlen);
-uint64_t zap_hash(zap_name_t *zn);
-
-uint64_t zap_get_micro_max_size(spa_t *spa);
 
+/* Microzap implementation. */
 zap_t *mzap_open(dmu_buf_t *db);
 int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx,
     zap_flags_t flags);
@@ -235,7 +281,9 @@ boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn,
     mzap_ent_t *mze, zfs_btree_index_t *idx);
 void mzap_addent(zap_name_t *zn, uint64_t value);
 void mzap_byteswap(mzap_phys_t *buf, size_t size);
+uint64_t zap_get_micro_max_size(spa_t *spa);
 
+/* Fatzap implementation. */
 void fzap_byteswap(void *buf, size_t size);
 int fzap_count(zap_t *zap, uint64_t *count);
 int fzap_lookup(zap_name_t *zn,
@@ -254,7 +302,6 @@ int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
 int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
 void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
 void zap_put_leaf(struct zap_leaf *l);
-
 int fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx);
diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c
index 8788480318f..c70fce67875 100644
--- a/module/zfs/zap_impl.c
+++ b/module/zfs/zap_impl.c
@@ -26,6 +26,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2024, Klara, Inc.
+ * Copyright (c) 2026, TrueNAS.
  */
 
 #include <sys/zfs_context.h>
@@ -35,6 +36,255 @@
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 
+static kmem_cache_t *zap_name_cache;
+static kmem_cache_t *zap_attr_cache;
+static kmem_cache_t *zap_name_long_cache;
+static kmem_cache_t *zap_attr_long_cache;
+
+/* Setup/teardown caches. Part of the public interface in zap.h. */
+void
+zap_init(void)
+{
+	zap_name_cache = kmem_cache_create("zap_name",
+	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
+	    NULL, NULL, NULL, 0);
+
+	zap_attr_cache = kmem_cache_create("zap_attr_cache",
+	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN,  0, NULL,
+	    NULL, NULL, NULL, NULL, 0);
+
+	zap_name_long_cache = kmem_cache_create("zap_name_long",
+	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
+	    NULL, NULL, NULL, 0);
+
+	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
+	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW,  0, NULL,
+	    NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zap_fini(void)
+{
+	kmem_cache_destroy(zap_name_cache);
+	kmem_cache_destroy(zap_attr_cache);
+	kmem_cache_destroy(zap_name_long_cache);
+	kmem_cache_destroy(zap_attr_long_cache);
+}
+
+static int
+zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
+    size_t outlen)
+{
+	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
+	size_t inlen = strlen(name) + 1;
+
+	int err = 0;
+	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
+	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
+	    U8_UNICODE_LATEST, &err);
+
+	return (err);
+}
+
+zap_name_t *
+zap_name_alloc(zap_t *zap, boolean_t longname)
+{
+	kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
+	zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
+
+	zn->zn_zap = zap;
+	zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
+	return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
+{
+	size_t key_len = strlen(key) + 1;
+	zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
+	if (zap_name_init_str(zn, key, mt) != 0) {
+		zap_name_free(zn);
+		return (NULL);
+	}
+	return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
+
+	ASSERT0(zap->zap_normflags);
+	zn->zn_zap = zap;
+	zn->zn_key_intlen = sizeof (*key);
+	zn->zn_key_orig = zn->zn_key_norm = key;
+	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
+	zn->zn_matchtype = 0;
+	zn->zn_normbuf_len = ZAP_MAXNAMELEN;
+
+	zn->zn_hash = zap_hash(zn);
+	return (zn);
+}
+
+void
+zap_name_free(zap_name_t *zn)
+{
+	if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
+		kmem_cache_free(zap_name_cache, zn);
+	} else {
+		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
+		kmem_cache_free(zap_name_long_cache, zn);
+	}
+}
+
+int
+zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
+{
+	zap_t *zap = zn->zn_zap;
+	size_t key_len = strlen(key) + 1;
+
+	/* Make sure zn is allocated for longname if key is long */
+	IMPLY(key_len > ZAP_MAXNAMELEN,
+	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);
+
+	zn->zn_key_intlen = sizeof (*key);
+	zn->zn_key_orig = key;
+	zn->zn_key_orig_numints = key_len;
+	zn->zn_matchtype = mt;
+	zn->zn_normflags = zap->zap_normflags;
+
+	/*
+	 * If we're dealing with a case sensitive lookup on a mixed or
+	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
+	 * will fold case to all caps overriding the lookup request.
+	 */
+	if (mt & MT_MATCH_CASE)
+		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
+
+	if (zap->zap_normflags) {
+		/*
+		 * We *must* use zap_normflags because this normalization is
+		 * what the hash is computed from.
+		 */
+		if (zap_normalize(zap, key, zn->zn_normbuf,
+		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
+			return (SET_ERROR(ENOTSUP));
+		zn->zn_key_norm = zn->zn_normbuf;
+		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+	} else {
+		if (mt != 0)
+			return (SET_ERROR(ENOTSUP));
+		zn->zn_key_norm = zn->zn_key_orig;
+		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
+	}
+
+	zn->zn_hash = zap_hash(zn);
+
+	if (zap->zap_normflags != zn->zn_normflags) {
+		/*
+		 * We *must* use zn_normflags because this normalization is
+		 * what the matching is based on.  (Not the hash!)
+		 */
+		if (zap_normalize(zap, key, zn->zn_normbuf,
+		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
+			return (SET_ERROR(ENOTSUP));
+		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+	}
+
+	return (0);
+}
+
+boolean_t
+zap_match(zap_name_t *zn, const char *matchname)
+{
+	boolean_t res = B_FALSE;
+	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
+	if (zn->zn_matchtype & MT_NORMALIZE) {
+		size_t namelen = zn->zn_normbuf_len;
+		char normbuf[ZAP_MAXNAMELEN];
+		char *norm = normbuf;
+
+		/*
+		 * Cannot allocate this on-stack as it exceeds the stack-limit
+		 * of 1024.
+		 */
+		if (namelen > ZAP_MAXNAMELEN)
+			norm = kmem_alloc(namelen, KM_SLEEP);
+
+		if (zap_normalize(zn->zn_zap, matchname, norm,
+		    zn->zn_normflags, namelen) != 0) {
+			res = B_FALSE;
+		} else {
+			res = (strcmp(zn->zn_key_norm, norm) == 0);
+		}
+		if (norm != normbuf)
+			kmem_free(norm, namelen);
+	} else {
+		res = (strcmp(zn->zn_key_orig, matchname) == 0);
+	}
+	return (res);
+}
+
+uint64_t
+zap_hash(zap_name_t *zn)
+{
+	zap_t *zap = zn->zn_zap;
+	uint64_t h = 0;
+
+	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+		h = *(uint64_t *)zn->zn_key_orig;
+	} else {
+		h = zap->zap_salt;
+		ASSERT(h != 0);
+		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+			const uint64_t *wp = zn->zn_key_norm;
+
+			ASSERT(zn->zn_key_intlen == 8);
+			for (int i = 0; i < zn->zn_key_norm_numints;
+			    wp++, i++) {
+				uint64_t word = *wp;
+
+				for (int j = 0; j < 8; j++) {
+					h = (h >> 8) ^
+					    zfs_crc64_table[(h ^ word) & 0xFF];
+					word >>= NBBY;
+				}
+			}
+		} else {
+			const uint8_t *cp = zn->zn_key_norm;
+
+			/*
+			 * We previously stored the terminating null on
+			 * disk, but didn't hash it, so we need to
+			 * continue to not hash it.  (The
+			 * zn_key_*_numints includes the terminating
+			 * null for non-binary keys.)
+			 */
+			int len = zn->zn_key_norm_numints - 1;
+
+			ASSERT(zn->zn_key_intlen == 1);
+			for (int i = 0; i < len; cp++, i++) {
+				h = (h >> 8) ^
+				    zfs_crc64_table[(h ^ *cp) & 0xFF];
+			}
+		}
+	}
+	/*
+	 * Don't use all 64 bits, since we need some in the cookie for
+	 * the collision differentiator.  We MUST use the high bits,
+	 * since those are the ones that we first pay attention to when
+	 * choosing the bucket.
+	 */
+	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
+
+	return (h);
+}
+
 /*
  * This routine "consumes" the caller's hold on the dbuf, which must
  * have the specified tag.
@@ -190,146 +440,19 @@ zap_unlockdir(zap_t *zap, const void *tag)
 	dmu_buf_rele(zap->zap_dbuf, tag);
 }
 
-static kmem_cache_t *zap_name_cache;
-static kmem_cache_t *zap_attr_cache;
-static kmem_cache_t *zap_name_long_cache;
-static kmem_cache_t *zap_attr_long_cache;
-
 void
-zap_init(void)
+zap_evict_sync(void *dbu)
 {
-	zap_name_cache = kmem_cache_create("zap_name",
-	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
-	    NULL, NULL, NULL, 0);
+	zap_t *zap = dbu;
 
-	zap_attr_cache = kmem_cache_create("zap_attr_cache",
-	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN,  0, NULL,
-	    NULL, NULL, NULL, NULL, 0);
+	rw_destroy(&zap->zap_rwlock);
 
-	zap_name_long_cache = kmem_cache_create("zap_name_long",
-	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
-	    NULL, NULL, NULL, 0);
+	if (zap->zap_ismicro)
+		mze_destroy(zap);
+	else
+		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 
-	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
-	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW,  0, NULL,
-	    NULL, NULL, NULL, NULL, 0);
-}
-
-void
-zap_fini(void)
-{
-	kmem_cache_destroy(zap_name_cache);
-	kmem_cache_destroy(zap_attr_cache);
-	kmem_cache_destroy(zap_name_long_cache);
-	kmem_cache_destroy(zap_attr_long_cache);
-}
-
-zap_name_t *
-zap_name_alloc(zap_t *zap, boolean_t longname)
-{
-	kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache;
-	zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP);
-
-	zn->zn_zap = zap;
-	zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN;
-	return (zn);
-}
-
-void
-zap_name_free(zap_name_t *zn)
-{
-	if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) {
-		kmem_cache_free(zap_name_cache, zn);
-	} else {
-		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
-		kmem_cache_free(zap_name_long_cache, zn);
-	}
-}
-
-int
-zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
-{
-	zap_t *zap = zn->zn_zap;
-	size_t key_len = strlen(key) + 1;
-
-	/* Make sure zn is allocated for longname if key is long */
-	IMPLY(key_len > ZAP_MAXNAMELEN,
-	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);
-
-	zn->zn_key_intlen = sizeof (*key);
-	zn->zn_key_orig = key;
-	zn->zn_key_orig_numints = key_len;
-	zn->zn_matchtype = mt;
-	zn->zn_normflags = zap->zap_normflags;
-
-	/*
-	 * If we're dealing with a case sensitive lookup on a mixed or
-	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
-	 * will fold case to all caps overriding the lookup request.
-	 */
-	if (mt & MT_MATCH_CASE)
-		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
-
-	if (zap->zap_normflags) {
-		/*
-		 * We *must* use zap_normflags because this normalization is
-		 * what the hash is computed from.
-		 */
-		if (zap_normalize(zap, key, zn->zn_normbuf,
-		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
-			return (SET_ERROR(ENOTSUP));
-		zn->zn_key_norm = zn->zn_normbuf;
-		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
-	} else {
-		if (mt != 0)
-			return (SET_ERROR(ENOTSUP));
-		zn->zn_key_norm = zn->zn_key_orig;
-		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
-	}
-
-	zn->zn_hash = zap_hash(zn);
-
-	if (zap->zap_normflags != zn->zn_normflags) {
-		/*
-		 * We *must* use zn_normflags because this normalization is
-		 * what the matching is based on.  (Not the hash!)
-		 */
-		if (zap_normalize(zap, key, zn->zn_normbuf,
-		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
-			return (SET_ERROR(ENOTSUP));
-		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
-	}
-
-	return (0);
-}
-
-zap_name_t *
-zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
-{
-	size_t key_len = strlen(key) + 1;
-	zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN));
-	if (zap_name_init_str(zn, key, mt) != 0) {
-		zap_name_free(zn);
-		return (NULL);
-	}
-	return (zn);
-}
-
-zap_name_t *
-zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
-{
-	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
-
-	ASSERT0(zap->zap_normflags);
-	zn->zn_zap = zap;
-	zn->zn_key_intlen = sizeof (*key);
-	zn->zn_key_orig = zn->zn_key_norm = key;
-	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
-	zn->zn_matchtype = 0;
-	zn->zn_normbuf_len = ZAP_MAXNAMELEN;
-
-	zn->zn_hash = zap_hash(zn);
-	return (zn);
+	kmem_free(zap, sizeof (zap_t));
 }
 
 uint64_t
@@ -358,112 +481,7 @@ zap_maxcd(zap_t *zap)
 		return (-1U);
 }
 
-uint64_t
-zap_hash(zap_name_t *zn)
-{
-	zap_t *zap = zn->zn_zap;
-	uint64_t h = 0;
-
-	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
-		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
-		h = *(uint64_t *)zn->zn_key_orig;
-	} else {
-		h = zap->zap_salt;
-		ASSERT(h != 0);
-		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
-		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
-			const uint64_t *wp = zn->zn_key_norm;
-
-			ASSERT(zn->zn_key_intlen == 8);
-			for (int i = 0; i < zn->zn_key_norm_numints;
-			    wp++, i++) {
-				uint64_t word = *wp;
-
-				for (int j = 0; j < 8; j++) {
-					h = (h >> 8) ^
-					    zfs_crc64_table[(h ^ word) & 0xFF];
-					word >>= NBBY;
-				}
-			}
-		} else {
-			const uint8_t *cp = zn->zn_key_norm;
-
-			/*
-			 * We previously stored the terminating null on
-			 * disk, but didn't hash it, so we need to
-			 * continue to not hash it.  (The
-			 * zn_key_*_numints includes the terminating
-			 * null for non-binary keys.)
-			 */
-			int len = zn->zn_key_norm_numints - 1;
-
-			ASSERT(zn->zn_key_intlen == 1);
-			for (int i = 0; i < len; cp++, i++) {
-				h = (h >> 8) ^
-				    zfs_crc64_table[(h ^ *cp) & 0xFF];
-			}
-		}
-	}
-	/*
-	 * Don't use all 64 bits, since we need some in the cookie for
-	 * the collision differentiator.  We MUST use the high bits,
-	 * since those are the ones that we first pay attention to when
-	 * choosing the bucket.
-	 */
-	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
-
-	return (h);
-}
-
-int
-zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
-    size_t outlen)
-{
-	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
-
-	size_t inlen = strlen(name) + 1;
-
-	int err = 0;
-	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
-	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
-	    U8_UNICODE_LATEST, &err);
-
-	return (err);
-}
-
-boolean_t
-zap_match(zap_name_t *zn, const char *matchname)
-{
-	boolean_t res = B_FALSE;
-	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
-
-	if (zn->zn_matchtype & MT_NORMALIZE) {
-		size_t namelen = zn->zn_normbuf_len;
-		char normbuf[ZAP_MAXNAMELEN];
-		char *norm = normbuf;
-
-		/*
-		 * Cannot allocate this on-stack as it exceed the stack-limit of
-		 * 1024.
-		 */
-		if (namelen > ZAP_MAXNAMELEN)
-			norm = kmem_alloc(namelen, KM_SLEEP);
-
-		if (zap_normalize(zn->zn_zap, matchname, norm,
-		    zn->zn_normflags, namelen) != 0) {
-			res = B_FALSE;
-		} else {
-			res = (strcmp(zn->zn_key_norm, norm) == 0);
-		}
-		if (norm != normbuf)
-			kmem_free(norm, namelen);
-	} else {
-		res = (strcmp(zn->zn_key_orig, matchname) == 0);
-	}
-	return (res);
-}
-
+/* DMU byteswap callback for DMU_BSWAP_ZAP, see dmu_ot_byteswap. */
 void
 zap_byteswap(void *buf, size_t size)
 {
@@ -477,21 +495,10 @@ zap_byteswap(void *buf, size_t size)
 	}
 }
 
-void
-zap_evict_sync(void *dbu)
-{
-	zap_t *zap = dbu;
-
-	rw_destroy(&zap->zap_rwlock);
-
-	if (zap->zap_ismicro)
-		mze_destroy(zap);
-	else
-		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
-
-	kmem_free(zap, sizeof (zap_t));
-}
-
+/*
+ * Cursor attribute allocator/free. These are part of the public interface
+ * in zap.h, but live in this file to get access to the kmem caches.
+ */
 static zap_attribute_t *
 zap_attribute_alloc_impl(boolean_t longname)
 {

From eaaea55b69319948e05aed393ad5ba02ec85902a Mon Sep 17 00:00:00 2001
From: Garth Snyder <garth@garthsnyder.com>
Date: Tue, 12 May 2026 08:49:55 -0600
Subject: [PATCH 019/129] Consistently encode DRR_BEGIN packed nvlist payloads
 with NV_ENCODE_XDR

Currently, zfs send generates a mix of nvlist encodings in DRR_BEGIN
records, some XDR and some in native byte order. The result is that
most streams currently can't be zfs received on opposite-endian systems.

zfs send generates the outer wrappers for compound streams in userspace,
and it explicitly requests NV_ENCODE_XDR format for those records. But
the BEGIN records for individual datasets are generated on the kernel
side, in dmu_send.c, where fnvlist_pack() is used for encoding. That
routine hard-wires NV_ENCODE_NATIVE format.

This PR replaces the fnvlist_pack() call with a direct call to
nvlist_pack() that specifies NV_ENCODE_XDR.

Tests are included to verify that native-encoded nvlists are not
generated by any kernel path that attaches nvlists to BEGIN records.
There's also a check for XDR encoding in the outer wrapper of
replication streams in case there is ever a regression there.

There are also two tests that have a chance of triggering (and
detecting) bug #18491. Non-triggering versions of those tests are
already included here, so when that bug is more fully characterized,
the tests can be moved to a more directly relevant category. (They
are the two tests with _with_write suffixes.)

This PR adds to zstream dump an output line that shows the exact
encoding of any nvlists in BEGIN records. This feature is used by
the tests to validate streams.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Garth Snyder <garth@garthsnyder.com>
Closes #18360
Closes #18372
---
 cmd/zstream/zstream_dump.c                    |  14 +++
 module/zfs/dmu_send.c                         |  37 +++++-
 tests/runfiles/common.run                     |   9 ++
 tests/test-runner/bin/zts-report.py.in        |   2 +
 tests/zfs-tests/tests/Makefile.am             |  18 +++
 .../functional/send_xdr_encoding/cleanup.ksh  |  27 ++++
 .../send_xdr_encoding/send_xdr_encoding.cfg   |  25 ++++
 .../send_xdr_encoding.kshlib                  |  71 +++++++++++
 .../functional/send_xdr_encoding/setup.ksh    |  29 +++++
 .../send_xdr_encoding/xdr_bookmark_raw.ksh    |  93 ++++++++++++++
 .../xdr_bookmark_raw_with_write.ksh           | 107 ++++++++++++++++
 .../xdr_incr_from_bookmark.ksh                |  88 +++++++++++++
 .../xdr_incr_from_redacted.ksh                |  96 +++++++++++++++
 .../functional/send_xdr_encoding/xdr_raw.ksh  |  67 ++++++++++
 .../send_xdr_encoding/xdr_redacted_full.ksh   |  72 +++++++++++
 .../xdr_redacted_received.ksh                 |  84 +++++++++++++
 .../xdr_redacted_received_raw.ksh             |  97 +++++++++++++++
 .../send_xdr_encoding/xdr_replication.ksh     |  90 ++++++++++++++
 .../send_xdr_encoding/xdr_resume.ksh          |  73 +++++++++++
 .../xdr_resume_bookmark_raw.ksh               | 103 ++++++++++++++++
 .../xdr_resume_bookmark_raw_with_write.ksh    | 116 ++++++++++++++++++
 .../send_xdr_encoding/xdr_resume_raw.ksh      |  79 ++++++++++++
 .../send_xdr_encoding/xdr_resume_redacted.ksh |  86 +++++++++++++
 23 files changed, 1481 insertions(+), 2 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/cleanup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg
 create mode 100644 tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_redacted.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_raw.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_full.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received_raw.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_replication.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_raw.ksh
 create mode 100755 tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_redacted.ksh

diff --git a/cmd/zstream/zstream_dump.c b/cmd/zstream/zstream_dump.c
index 6ccc57204c8..7757ee3b175 100644
--- a/cmd/zstream/zstream_dump.c
+++ b/cmd/zstream/zstream_dump.c
@@ -385,6 +385,20 @@ zstream_do_dump(int argc, char *argv[])
 				(void) ssread(buf, sz, &zc);
 				if (ferror(send_stream))
 					perror("fread");
+
+				uint8_t *nv_header = (uint8_t *)buf;
+				boolean_t xdr = nv_header[0] == NV_ENCODE_XDR;
+				boolean_t big_endian = nv_header[1] == 0;
+				const char *nc;
+				if (xdr) {
+					nc = "NV_ENCODE_XDR";
+				} else if (big_endian) {
+					nc = "NV_ENCODE_NATIVE (big-endian)";
+				} else {
+					nc = "NV_ENCODE_NATIVE (little-endian)";
+				}
+				printf("nvlist encoding = %s\n", nc);
+
 				err = nvlist_unpack(buf, sz, &nv, 0);
 				if (err) {
 					perror(strerror(err));
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 4c354722e4f..d931d9432f0 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2241,6 +2241,37 @@ setup_send_progress(struct dmu_send_params *dspp)
 	return (dssp);
 }
 
+/*
+ * Payloads must be multiples of 8 bytes for historical compatibility, but
+ * XDR-encoded nvlists are sized in multiples of 4 bytes and may need padding.
+ *
+ * Here we do the simplest possible thing and copy the data to a separate
+ * buffer. Not ideal in terms of performance and memory use, but most BEGIN
+ * nvlists are small or absent, the allocation is momentary, and we'll need
+ * to do this at most once per dataset.
+ *
+ * It's OK if there is extra data after a packed nvlist on the receiving
+ * side because packed nvlists have an internal end-of-list marker.
+ *
+ * The new buffer is allocated with kmem_alloc() and can be freed with
+ * fnvlist_pack_free(), like the original.
+ */
+static inline void
+pad_packed_nvlist(char **buffer, size_t *size)
+{
+	size_t size_in = *size;
+	size_t extra_bytes = P2ROUNDUP(size_in, 8) - size_in;
+	if (extra_bytes != 0) {
+		size_t expanded_size = size_in + extra_bytes;
+		char *longbuf = kmem_alloc(expanded_size, KM_SLEEP);
+		memcpy(longbuf, *buffer, size_in);
+		memset(longbuf + size_in, 0, extra_bytes);
+		fnvlist_pack_free(*buffer, size_in);
+		*buffer = longbuf;
+		*size = expanded_size;
+	}
+}
+
 /*
  * Actually do the bulk of the work in a zfs send.
  *
@@ -2474,7 +2505,7 @@ dmu_send_impl(struct dmu_send_params *dspp)
 
 	dsl_pool_rele(dp, tag);
 
-	void *payload = NULL;
+	char *payload = NULL;
 	size_t payload_len = 0;
 	nvlist_t *nvl = fnvlist_alloc();
 
@@ -2548,7 +2579,9 @@ dmu_send_impl(struct dmu_send_params *dspp)
 	}
 
 	if (!nvlist_empty(nvl)) {
-		payload = fnvlist_pack(nvl, &payload_len);
+		VERIFY0(nvlist_pack(nvl, &payload, &payload_len,
+		    NV_ENCODE_XDR, KM_SLEEP));
+		pad_packed_nvlist(&payload, &payload_len);
 		drr->drr_payloadlen = payload_len;
 	}
 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 14e4bd79f85..f18835da74b 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -1025,6 +1025,15 @@ tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
     'scrub_mirror_003_pos', 'scrub_mirror_004_pos']
 tags = ['functional', 'scrub_mirror']
 
+[tests/functional/send_xdr_encoding]
+tests = ['xdr_bookmark_raw', 'xdr_bookmark_raw_with_write',
+    'xdr_incr_from_bookmark', 'xdr_incr_from_redacted', 'xdr_raw',
+    'xdr_redacted_full', 'xdr_redacted_received',
+    'xdr_redacted_received_raw', 'xdr_replication', 'xdr_resume',
+    'xdr_resume_bookmark_raw', 'xdr_resume_bookmark_raw_with_write',
+    'xdr_resume_raw', 'xdr_resume_redacted']
+tags = ['functional', 'send_xdr_encoding']
+
 [tests/functional/slog]
 tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
     'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg',
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 29d2760ccb8..2cbd2f02a31 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -253,6 +253,8 @@ maybe = {
     'renameat2/setup': ['SKIP', renameat2_reason],
     'reservation/reservation_008_pos': ['FAIL', 7741],
     'reservation/reservation_018_pos': ['FAIL', 5642],
+    'send_xdr_encoding/xdr_bookmark_raw_with_write': ['FAIL', 18491],
+    'send_xdr_encoding/xdr_resume_bookmark_raw_with_write': ['FAIL', 18491],
     'snapshot/clone_001_pos': ['FAIL', known_reason],
     'snapshot/snapshot_006_pos': ['FAIL', known_reason],
     'snapshot/snapshot_009_pos': ['FAIL', 7961],
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 28acc6f3af1..5dd350ece7c 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -376,6 +376,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \
 	functional/rsend/rsend.kshlib \
 	functional/scrub_mirror/default.cfg \
 	functional/scrub_mirror/scrub_mirror_common.kshlib \
+	functional/send_xdr_encoding/send_xdr_encoding.cfg \
+	functional/send_xdr_encoding/send_xdr_encoding.kshlib \
 	functional/slog/slog.cfg \
 	functional/slog/slog.kshlib \
 	functional/snapshot/snapshot.cfg \
@@ -2129,6 +2131,22 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/scrub_mirror/scrub_mirror_003_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_004_pos.ksh \
 	functional/scrub_mirror/setup.ksh \
+	functional/send_xdr_encoding/cleanup.ksh \
+	functional/send_xdr_encoding/setup.ksh \
+	functional/send_xdr_encoding/xdr_bookmark_raw.ksh \
+	functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh \
+	functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh \
+	functional/send_xdr_encoding/xdr_incr_from_redacted.ksh \
+	functional/send_xdr_encoding/xdr_raw.ksh \
+	functional/send_xdr_encoding/xdr_redacted_full.ksh \
+	functional/send_xdr_encoding/xdr_redacted_received.ksh \
+	functional/send_xdr_encoding/xdr_redacted_received_raw.ksh \
+	functional/send_xdr_encoding/xdr_replication.ksh \
+	functional/send_xdr_encoding/xdr_resume.ksh \
+	functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh \
+	functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh \
+	functional/send_xdr_encoding/xdr_resume_raw.ksh \
+	functional/send_xdr_encoding/xdr_resume_redacted.ksh \
 	functional/slog/cleanup.ksh \
 	functional/slog/setup.ksh \
 	functional/slog/slog_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/cleanup.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/cleanup.ksh
new file mode 100755
index 00000000000..8261885e651
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/cleanup.ksh
@@ -0,0 +1,27 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+destroy_pool $POOL
+destroy_pool $POOL2
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg b/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg
new file mode 100644
index 00000000000..e4999a3ca29
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+read -r DISK1 DISK2 _ <<<"$DISKS"
+export DISK1 DISK2
+
+export POOL=$TESTPOOL
+export POOL2=$TESTPOOL2
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib b/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
new file mode 100644
index 00000000000..8e36b748439
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
@@ -0,0 +1,71 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg
+
+#
+# Verify that the DRR_BEGIN records in the given send stream encode their
+# nvlist payloads with NV_ENCODE_XDR (and not NV_ENCODE_NATIVE).
+#
+# DRR_BEGIN records that carry an nvlist payload (raw sends, redacted sends,
+# resumed sends, and combinations thereof) must encode that payload with
+# NV_ENCODE_XDR so the resulting stream can be portably consumed across
+# endianness. Encoding the payload with NV_ENCODE_NATIVE produces a stream
+# that is unreadable on a receiver of the opposite endianness.
+#
+# zstream dump prints a single "nvlist encoding = ..." line per DRR_BEGIN
+# record that carries an nvlist payload. The possible values are:
+#
+#     NV_ENCODE_XDR
+#     NV_ENCODE_NATIVE (big-endian)
+#     NV_ENCODE_NATIVE (little-endian)
+#
+# Every test in this suite generates a stream whose DRR_BEGIN record
+# carries an nvlist payload, so the pass criterion is:
+#
+#   - At least one NV_ENCODE_XDR line appears, AND
+#   - No NV_ENCODE_NATIVE line appears.
+#
+# Requiring at least one XDR line catches the case where zstream dump
+# itself fails before producing any encoding output. Asserting on dump
+# content rather than dump exit status means a partial dump can still
+# fail the test on an NV_ENCODE_NATIVE seen before the failure point.
+#
+function verify_xdr_nvlist_encoding
+{
+	typeset stream=$1
+	typeset out
+
+	[[ -f "$stream" ]] || \
+	    log_fail "verify_xdr_nvlist_encoding: stream not found: $stream"
+
+	out=$(zstream dump "$stream" 2>/dev/null)
+
+	if echo "$out" | grep -q 'NV_ENCODE_NATIVE'; then
+		log_fail "verify_xdr_nvlist_encoding: " \
+		    "NV_ENCODE_NATIVE found in $stream"
+	fi
+	if ! echo "$out" | grep -q 'NV_ENCODE_XDR'; then
+		log_fail "verify_xdr_nvlist_encoding: " \
+		    "no NV_ENCODE_XDR found in $stream"
+	fi
+}
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/setup.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/setup.ksh
new file mode 100755
index 00000000000..609acba3a22
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/setup.ksh
@@ -0,0 +1,29 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+verify_disk_count "$DISKS" 2
+
+create_pool $POOL $DISK1
+create_pool $POOL2 $DISK2
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw.ksh
new file mode 100755
index 00000000000..9ba10d9e605
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw.ksh
@@ -0,0 +1,93 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# A raw incremental send from a redaction bookmark on an encrypted dataset
+# (zfs send -w -i ds#book ds@snap) carries both BEGINNV_REDACT_FROM_SNAPS
+# and crypt_keydata in its DRR_BEGIN nvlist payload. Verify that this
+# combined payload is XDR-encoded and the stream can be received.
+#
+# Strategy:
+# 1. Create an encrypted source dataset with a redaction bookmark and a
+#    later snapshot.
+# 2. Establish a raw base on the receiver via zfs send -w of the bookmark's
+#    source snapshot.
+# 3. zfs send -w -i sendfs#book sendfs@s1 to a file.
+# 4. Verify that the resulting stream is XDR-encoded.
+# 5. Verify that the zfs receive succeeds.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_bookmark_raw_src"
+clonefs="$POOL/xdr_bookmark_raw_clone"
+recvfs="$POOL2/xdr_bookmark_raw_recv"
+keyfile="/$POOL/xdr_bookmark_raw.key"
+full_stream="/$POOL/xdr_bookmark_raw_full.zsend"
+incr_stream="/$POOL/xdr_bookmark_raw_incr.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $keyfile $full_stream $incr_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a raw incremental from a redaction bookmark is " \
+    "XDR-encoded and receivable"
+
+log_must eval "echo 'thisisapassphrase' > $keyfile"
+log_must zfs create -o encryption=on -o keyformat=passphrase \
+    -o keylocation=file://$keyfile $sendfs
+
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s0
+
+# The clone inherits encryption from $sendfs.
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+# Take @s1 with no intervening writes. See xdr_bookmark_raw_with_write.ksh
+# for a variant that includes a post-redact write; that variant exercises
+# a known kernel-side issue (#18491) and may flake.
+log_must zfs snapshot $sendfs@s1
+
+# Establish a raw base on the receiver.
+log_must eval "zfs send -w $sendfs@s0 > $full_stream"
+log_must eval "zfs receive $recvfs < $full_stream"
+
+# Raw incremental from the redaction bookmark. This is the test focus.
+log_must eval "zfs send -w -i $sendfs#redaction-bookmark $sendfs@s1 > \
+    $incr_stream"
+verify_xdr_nvlist_encoding $incr_stream
+log_must eval "zfs receive $recvfs < $incr_stream"
+
+log_pass "BEGIN nvlist of a raw incremental from a redaction bookmark is " \
+    "XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh
new file mode 100755
index 00000000000..c58735f04d4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh
@@ -0,0 +1,107 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# This is the post-redact-write variant of xdr_bookmark_raw, separated out
+# because of a known issue (#18491) that causes it to fail roughly 30% of
+# the time. It's included here as a test for issue #18491 until the exact
+# source of that problem can be pinned down more specifically.
+#
+# Known issue: openzfs/zfs#18491
+#
+# On a freshly-created pool, `zfs send -w -i ds#book ds@snap` intermittently
+# fails with EACCES whenever there is data-modifying activity between the
+# `zfs redact` that created the bookmark and the subsequent send. This EACCES
+# is surfaced to userspace as the misleading message "dataset key must be
+# loaded," although the key remains loaded throughout.
+#
+# The reproducer script included in the issue report typically triggers the
+# problem within about 10 iterations on a fresh pool. Disk-sync mitigations
+# (zpool sync, with or without `-f`, with or without sleep, single or doubled,
+# applied at any reasonable point) do not avert the problem. CI runs that
+# include the test in this file reproduce the failure regularly (though
+# intermittently) across multiple distributions. xdr_bookmark_raw.ksh
+# removes the post-redact write (which is not essential to the test) and
+# therefore runs reliably.
+#
+# When this test fails, the failure marker is the libzfs warning
+# "dataset key must be loaded" on stderr from the `zfs send -w -i`
+# line below (the one that produces the incremental stream), and a
+# non-zero exit from that send. The test does not attempt to distinguish
+# the known-issue failure from other possible failures.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_bookmark_raw_with_write_src"
+clonefs="$POOL/xdr_bookmark_raw_with_write_clone"
+recvfs="$POOL2/xdr_bookmark_raw_with_write_recv"
+keyfile="/$POOL/xdr_bookmark_raw_with_write.key"
+full_stream="/$POOL/xdr_bookmark_raw_with_write_full.zsend"
+incr_stream="/$POOL/xdr_bookmark_raw_with_write_incr.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $keyfile $full_stream $incr_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a raw incremental from a redaction bookmark, " \
+    "with a post-redact write, is XDR-encoded and receivable " \
+    "(known to flake; see openzfs/zfs#18491)"
+
+log_must eval "echo 'thisisapassphrase' > $keyfile"
+log_must zfs create -o encryption=on -o keyformat=passphrase \
+    -o keylocation=file://$keyfile $sendfs
+
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s0
+
+# The clone inherits encryption from $sendfs.
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+# Post-redact write: the trigger for openzfs/zfs#18491.
+log_must dd if=/dev/urandom of=/$sendfs/f3 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s1
+
+# Establish a raw base on the receiver.
+log_must eval "zfs send -w $sendfs@s0 > $full_stream"
+log_must eval "zfs receive $recvfs < $full_stream"
+
+# The next line is what races. On failure it exits with EACCES rendered
+# as "dataset key must be loaded".
+log_must eval "zfs send -w -i $sendfs#redaction-bookmark $sendfs@s1 > \
+    $incr_stream"
+verify_xdr_nvlist_encoding $incr_stream
+log_must eval "zfs receive $recvfs < $incr_stream"
+
+log_pass "BEGIN nvlist of a raw incremental from a redaction bookmark, " \
+    "with a post-redact write, is XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh
new file mode 100755
index 00000000000..ab04f6aa603
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh
@@ -0,0 +1,88 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# An incremental send from a redaction bookmark (zfs send -i ds#book ds@snap)
+# carries BEGINNV_REDACT_FROM_SNAPS in its DRR_BEGIN nvlist payload (via the
+# from_rl path). Verify that this payload is XDR-encoded and the stream can
+# be received.
+#
+# Strategy:
+# 1. Create a source dataset with a redaction bookmark.
+# 2. Send a redacted full stream from that bookmark's source snapshot
+#    and receive it into a second pool as a base.
+# 3. Add data and a new snapshot on the source.
+# 4. zfs send -i sendfs#redaction-bookmark sendfs@snap to a file.
+# 5. Verify XDR encoding in the resulting stream.
+# 6. Verify that zfs receive of the stream succeeds.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_incr_from_bookmark_src"
+clonefs="$POOL/xdr_incr_from_bookmark_clone"
+recvfs="$POOL2/xdr_incr_from_bookmark_recv"
+full_stream="/$POOL/xdr_incr_from_bookmark_full.zsend"
+incr_stream="/$POOL/xdr_incr_from_bookmark_incr.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $full_stream $incr_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of an incremental send from a redaction bookmark " \
+    "is XDR-encoded and receivable"
+
+log_must zfs create $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s0
+
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+# Establish a base on the receiver.
+log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream"
+log_must eval "zfs receive $recvfs < $full_stream"
+
+# Add a new snapshot on the source for the incremental.
+log_must dd if=/dev/urandom of=/$sendfs/f3 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s1
+
+# Generate an incremental send from the redaction bookmark. This fires
+# BEGINNV_REDACT_FROM_SNAPS via the from_rl path because the from-side
+# is a redaction bookmark.
+log_must eval "zfs send -i $sendfs#redaction-bookmark $sendfs@s1 > $incr_stream"
+verify_xdr_nvlist_encoding $incr_stream
+log_must eval "zfs receive $recvfs < $incr_stream"
+
+log_pass "BEGIN nvlist of an incremental send from a redaction bookmark " \
+    "is XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_redacted.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_redacted.ksh
new file mode 100755
index 00000000000..fc4d34c4346
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_redacted.ksh
@@ -0,0 +1,96 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# An incremental send whose from-side is a snapshot of a previously-redacted
+# dataset carries BEGINNV_REDACT_FROM_SNAPS in its DRR_BEGIN nvlist payload
+# via a different code path than incrementals from a redaction bookmark
+# (the dspp->numfromredactsnaps path). Verify that this payload is
+# XDR-encoded and that the stream can be received.
+#
+# Strategy:
+# 1. Produce a redacted dataset on a receiver via a redacted full send,
+#    leaving the receiver with a snapshot (the eventual from-side) that
+#    has the SPA_FEATURE_REDACTED_DATASETS feature active.
+# 2. Establish the same base on a tertiary destination so we have somewhere
+#    to apply the incremental.
+# 3. Create a new snapshot of the receiver-side redacted dataset.
+# 4. zfs send -i mid@s0 mid@s1 to a file.
+# 5. Verify that the stream is XDR encoded.
+# 6. Verify that we can zfs receive the incremental onto the tertiary base.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_incr_from_redacted_src"
+clonefs="$POOL/xdr_incr_from_redacted_clone"
+midfs="$POOL2/xdr_incr_from_redacted_mid"
+tertiary="$POOL/xdr_incr_from_redacted_tertiary"
+full_stream="/$POOL/xdr_incr_from_redacted_full.zsend"
+incr_stream="/$POOL/xdr_incr_from_redacted_incr.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $midfs && destroy_dataset $midfs -R
+	datasetexists $tertiary && destroy_dataset $tertiary -R
+	rm -f $full_stream $incr_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of an incremental from a previously-redacted " \
+    "snapshot is XDR-encoded and receivable"
+
+log_must zfs create $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s0
+
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+# Produce two receivers of the redacted full send: one we will re-send from
+# (mid) and one we will receive the incremental into (tertiary).
+log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream"
+log_must eval "zfs receive $midfs < $full_stream"
+log_must eval "zfs receive $tertiary < $full_stream"
+
+# Create a fresh snapshot of the redacted receiver. The data has not changed
+# (and cannot be modified without mounting), but the snapshot itself is
+# enough to drive an incremental send and trigger the case-4 nvlist path.
+log_must zfs snapshot $midfs@s1
+
+# Create an incremental send from the redacted from-side. This fires
+# BEGINNV_REDACT_FROM_SNAPS via the dspp->numfromredactsnaps path because
+# $midfs@s0 has the SPA_FEATURE_REDACTED_DATASETS feature active.
+log_must eval "zfs send -i $midfs@s0 $midfs@s1 > $incr_stream"
+verify_xdr_nvlist_encoding $incr_stream
+log_must eval "zfs receive $tertiary < $incr_stream"
+
+log_pass "BEGIN nvlist of an incremental from a previously-redacted snapshot " \
+    "is XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_raw.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_raw.ksh
new file mode 100755
index 00000000000..c3a196650c6
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_raw.ksh
@@ -0,0 +1,67 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# A raw send of an encrypted dataset (zfs send -w) carries a "crypt_keydata"
+# nested nvlist in its DRR_BEGIN nvlist payload. Verify that this payload is
+# XDR-encoded and that the stream can be received.
+#
+# Strategy:
+# 1. Create an encrypted dataset with one snapshot.
+# 2. zfs send -w to a file.
+# 3. Verify that the stream is XDR-encoded.
+# 4. Verify that zfs receive succeeds.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_raw_src"
+recvfs="$POOL2/xdr_raw_recv"
+keyfile="/$POOL/xdr_raw.key"
+stream="/$POOL/xdr_raw.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -r
+	datasetexists $recvfs && destroy_dataset $recvfs -r
+	rm -f $keyfile $stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a raw send of an encrypted dataset is " \
+    "XDR-encoded and receivable"
+
+log_must eval "echo 'thisisapassphrase' > $keyfile"
+log_must zfs create -o encryption=on -o keyformat=passphrase \
+    -o keylocation=file://$keyfile $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s1
+
+log_must eval "zfs send -w $sendfs@s1 > $stream"
+
+verify_xdr_nvlist_encoding $stream
+log_must eval "zfs receive $recvfs < $stream"
+
+log_pass "BEGIN nvlist of a raw send of an encrypted dataset is " \
+    "XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_full.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_full.ksh
new file mode 100755
index 00000000000..2bad9bebdaa
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_full.ksh
@@ -0,0 +1,72 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# A redacted send (zfs send --redact <bookmark>) carries BEGINNV_REDACT_SNAPS
+# in its DRR_BEGIN nvlist payload. Verify that this payload is XDR-encoded and
+# the stream can be received.
+#
+# Strategy:
+# 1. Create a source dataset and a divergent clone.
+# 2. Create a redaction bookmark on the source snapshot relative to the
+#    clone snapshot.
+# 3. zfs send --redact <bookmark> sendfs@snap to a file.
+# 4. Verify that the stream is XDR-encoded (verify_xdr_nvlist_encoding).
+# 5. Verify that zfs receive succeeds.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_redacted_full_src"
+clonefs="$POOL/xdr_redacted_full_clone"
+recvfs="$POOL2/xdr_redacted_full_recv"
+stream="/$POOL/xdr_redacted_full.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a redacted send is XDR-encoded and receivable"
+
+log_must zfs create $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s0
+
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $stream"
+verify_xdr_nvlist_encoding $stream
+log_must eval "zfs receive $recvfs < $stream"
+
+log_pass "BEGIN nvlist of a redacted send is XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received.ksh
new file mode 100755
index 00000000000..a18b1f40594
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received.ksh
@@ -0,0 +1,84 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# Sending a snapshot from a previously-redacted dataset (one with the
+# SPA_FEATURE_REDACTED_DATASETS feature active, e.g., one that was received
+# from a redacted send) carries BEGINNV_REDACT_SNAPS in its DRR_BEGIN
+# nvlist payload via a different code path than the producer-side --redact
+# flag. Verify that this payload is XDR-encoded and that the stream can be
+# received.
+#
+# Strategy:
+# 1. Produce a redacted dataset on a receiver via a redacted full send.
+# 2. zfs send the received-redacted snapshot to a file.
+# 3. Verify XDR encoding on the new stream.
+# 4. Verify that a zfs receive of the new stream succeeds.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_redacted_received_src"
+clonefs="$POOL/xdr_redacted_received_clone"
+midfs="$POOL2/xdr_redacted_received_mid"
+recvfs="$POOL2/xdr_redacted_received_recv"
+full_stream="/$POOL/xdr_redacted_received_full.zsend"
+resend_stream="/$POOL/xdr_redacted_received_resend.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $midfs && destroy_dataset $midfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $full_stream $resend_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a send from a previously-redacted dataset is " \
+    "XDR-encoded and receivable"
+
+log_must zfs create $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s0
+
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+# Produce a previously-redacted dataset on the receiver.
+log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream"
+log_must eval "zfs receive $midfs < $full_stream"
+
+# Send the received-redacted snapshot. This fires BEGINNV_REDACT_SNAPS via
+# the SPA_FEATURE_REDACTED_DATASETS code path on to_ds.
+log_must eval "zfs send $midfs@s0 > $resend_stream"
+verify_xdr_nvlist_encoding $resend_stream
+log_must eval "zfs receive $recvfs < $resend_stream"
+
+log_pass "BEGIN nvlist of a send from a previously-redacted dataset is " \
+    "XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received_raw.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received_raw.ksh
new file mode 100755
index 00000000000..2efcba32b9f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received_raw.ksh
@@ -0,0 +1,97 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# zfs send explicitly disallows the source-side combination of -w and
+# --redact. However, the same nvlist combination (BEGINNV_REDACT_SNAPS
+# together with crypt_keydata) can still be reached by:
+#
+#   1. Sending a redacted (non-raw) stream from an unencrypted source.
+#   2. Receiving it with receiver-side encryption.
+#   3. Re-sending the now-encrypted-and-redacted dataset with -w.
+#
+# The final stream's DRR_BEGIN nvlist contains both the redact-snaps array
+# (via the SPA_FEATURE_REDACTED_DATASETS code path on to_ds) and
+# crypt_keydata (via DMU_BACKUP_FEATURE_RAW). Verify that this combined
+# payload is XDR-encoded and that the stream can be received.
+#
+# Strategy:
+# 1. Create an unencrypted source dataset with a redaction bookmark.
+# 2. zfs send --redact <book> sendfs@snap to a file (no -w).
+# 3. zfs receive into a new dataset with -o encryption=on (receiver-side
+#    encryption).
+# 4. zfs send -w the received dataset to a second stream file.
+# 5. Verify that this second stream is XDR-encoded.
+# 6. Verify that the second stream can be zfs received successfully.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_redacted_received_raw_src"
+clonefs="$POOL/xdr_redacted_received_raw_clone"
+midfs="$POOL2/xdr_redacted_received_raw_mid"
+recvfs="$POOL2/xdr_redacted_received_raw_recv"
+keyfile="/$POOL/xdr_redacted_received_raw.key"
+full_stream="/$POOL/xdr_redacted_received_raw_full.zsend"
+resend_stream="/$POOL/xdr_redacted_received_raw_resend.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $midfs && destroy_dataset $midfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $keyfile $full_stream $resend_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a raw send of a received-redacted dataset is " \
+    "XDR-encoded and receivable"
+
+log_must zfs create $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s0
+
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+# Redacted send (non-raw) into a receiver that establishes its own encryption.
+log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream"
+log_must eval "echo 'thisisapassphrase' > $keyfile"
+log_must eval "zfs receive -o encryption=on -o keyformat=passphrase " \
+    "-o keylocation=file://$keyfile $midfs < $full_stream"
+
+# Re-send the received stream as a raw (encrypted) stream. The DRR_BEGIN
+# nvlist now carries both BEGINNV_REDACT_SNAPS data and crypt_keydata
+# (DMU_BACKUP_FEATURE_RAW).
+log_must eval "zfs send -w $midfs@s0 > $resend_stream"
+verify_xdr_nvlist_encoding $resend_stream
+log_must eval "zfs receive $recvfs < $resend_stream"
+
+log_pass "BEGIN nvlist of a raw send of a received-redacted dataset is " \
+    "XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_replication.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_replication.ksh
new file mode 100755
index 00000000000..22d0bf20410
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_replication.ksh
@@ -0,0 +1,90 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# A replication send (zfs send -R) may emit two distinct categories of
+# DRR_BEGIN record:
+#
+#   1. A wrapper BEGIN of type DMU_COMPOUNDSTREAM, generated in libzfs
+#      (lib/libzfs/libzfs_sendrecv.c), whose nvlist describes the package
+#      stream. This BEGIN has always been XDR-encoded and is not affected
+#      by the kernel-side encoding changes introduced in PR #18372.
+#
+#   2. One inner BEGIN record per dataset whose contents are included,
+#      generated in the kernel (module/zfs/dmu_send.c). These are the BEGIN
+#      records whose encoding the kernel-side change consolidates to XDR.
+#
+# All other tests in this suite exercise category (2). This test exercises
+# both categories together: it verifies that no BEGIN record produced
+# anywhere on the userspace+kernel send path is encoded with NV_ENCODE_NATIVE,
+# so a future regression in either layer would be caught.
+#
+# Strategy:
+# 1. Create an unencrypted parent dataset and an encrypted child filesystem
+#    underneath it, with some data in each. The encrypted child is what
+#    causes the kernel-side inner BEGIN to actually carry an nvlist payload
+#    (crypt_keydata) rather than passing through silently.
+# 2. Snapshot recursively.
+# 3. zfs send -wR parent@snap to a file. The resulting stream contains a
+#    libzfs-generated wrapper BEGIN with its compound-stream nvlist plus
+#    one kernel-generated inner BEGIN per dataset; the child's inner BEGIN
+#    carries crypt_keydata.
+# 4. Verify the encoding for the whole stream — this checks every BEGIN
+#    nvlist line that zstream dump emits, so it covers both the wrapper
+#    and the encrypted child's inner record.
+# 5. Verify that the stream can be zfs received successfully.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_replication_src"
+childfs="$POOL/xdr_replication_src/child"
+recvfs="$POOL2/xdr_replication_recv"
+keyfile="/$POOL/xdr_replication.key"
+stream="/$POOL/xdr_replication.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $keyfile $stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlists in a recursive replication stream (wrapper and inner) are XDR-encoded and receivable"
+
+log_must zfs create $sendfs
+log_must eval "echo 'thisisapassphrase' > $keyfile"
+log_must zfs create -o encryption=on -o keyformat=passphrase \
+    -o keylocation=file://$keyfile $childfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=4 status=none
+log_must dd if=/dev/urandom of=/$childfs/f1 bs=128k count=4 status=none
+log_must zfs snapshot -r $sendfs@s0
+
+log_must eval "zfs send -wR $sendfs@s0 > $stream"
+verify_xdr_nvlist_encoding $stream
+log_must eval "zfs receive $recvfs < $stream"
+
+log_pass "BEGIN nvlists in a recursive replication stream (wrapper and inner) are XDR-encoded and receivable"
+
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume.ksh
new file mode 100755
index 00000000000..e98de4c47f4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume.ksh
@@ -0,0 +1,73 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# A token-resumed send (zfs send -t <token>) carries BEGINNV_RESUME_OBJECT
+# and BEGINNV_RESUME_OFFSET in its DRR_BEGIN nvlist payload. Verify that
+# this payload is XDR-encoded and that the resumed stream can be received.
+#
+# Strategy:
+# 1. Create a small dataset with one snapshot.
+# 2. zfs send the snapshot to a file, truncate it, then attempt receive
+#    so that a resume token is left behind.
+# 3. zfs send -t <token> to produce the resumed stream.
+# 4. Verify that the resumed stream is XDR-encoded.
+# 5. Verify that zfs receive -s on the resumed stream is successful.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_resume_src"
+recvfs="$POOL2/xdr_resume_recv"
+full_stream="/$POOL/xdr_resume_full.zsend"
+resumed_stream="/$POOL/xdr_resume_resumed.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -r
+	datasetexists $recvfs && destroy_dataset $recvfs -r
+	rm -f $full_stream $resumed_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a token-resumed send is XDR-encoded and receivable"
+
+log_must zfs create $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none
+log_must zfs snapshot $sendfs@s1
+
+log_must eval "zfs send $sendfs@s1 > $full_stream"
+mess_send_file $full_stream
+log_mustnot eval "zfs receive -s $recvfs < $full_stream"
+
+token=$(get_prop receive_resume_token $recvfs)
+[[ -n "$token" && "$token" != "-" ]] || \
+    log_fail "no resume token left behind by partial receive"
+log_must eval "zfs send -t $token > $resumed_stream"
+
+verify_xdr_nvlist_encoding $resumed_stream
+log_must eval "zfs receive -s $recvfs < $resumed_stream"
+
+log_pass "BEGIN nvlist of a token-resumed send is XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh
new file mode 100755
index 00000000000..6645315fcd7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh
@@ -0,0 +1,103 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# The most populated DRR_BEGIN nvlist in the kernel: a token-resumed raw
+# incremental from a redaction bookmark carries BEGINNV_REDACT_FROM_SNAPS,
+# crypt_keydata, and BEGINNV_RESUME_{OBJECT,OFFSET}. Verify that this
+# combined payload is XDR-encoded and the resumed stream can be received.
+#
+# Strategy:
+# 1. Create an encrypted source with a redaction bookmark and a later
+#    snapshot, mirroring xdr_bookmark_raw.
+# 2. Establish a raw base on the receiver.
+# 3. zfs send -w -i sendfs#book sendfs@s1 to a file, truncate it, then
+#    attempt receive so that a resume token is left behind.
+# 4. zfs send -t <token> to produce the resumed stream.
+# 5. Verify that the resumed stream is XDR-encoded.
+# 6. Verify that zfs receive -s of the resumed stream is successful.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_resume_bookmark_raw_src"
+clonefs="$POOL/xdr_resume_bookmark_raw_clone"
+recvfs="$POOL2/xdr_resume_bookmark_raw_recv"
+keyfile="/$POOL/xdr_resume_bookmark_raw.key"
+full_stream="/$POOL/xdr_resume_bookmark_raw_full.zsend"
+incr_stream="/$POOL/xdr_resume_bookmark_raw_incr.zsend"
+resumed_stream="/$POOL/xdr_resume_bookmark_raw_resumed.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $keyfile $full_stream $incr_stream $resumed_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a token-resumed raw incremental from a redaction " \
+    "bookmark is XDR-encoded and receivable"
+
+log_must eval "echo 'thisisapassphrase' > $keyfile"
+log_must zfs create -o encryption=on -o keyformat=passphrase \
+    -o keylocation=file://$keyfile $sendfs
+
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=16 status=none
+log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=16 status=none
+log_must zfs snapshot $sendfs@s0
+
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=16 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+# Take @s1 with no intervening write. See xdr_resume_bookmark_raw_with_write.ksh
+# for a variant that includes a post-redact write; that variant exercises
+# a known kernel-side issue (#18491) and may flake.
+log_must zfs snapshot $sendfs@s1
+
+# Establish a raw base on the receiver.
+log_must eval "zfs send -w $sendfs@s0 > $full_stream"
+log_must eval "zfs receive $recvfs < $full_stream"
+
+# Truncate-and-resume on the raw incremental from the redaction bookmark.
+log_must eval "zfs send -w -i $sendfs#redaction-bookmark $sendfs@s1 > \
+    $incr_stream"
+mess_send_file $incr_stream
+log_mustnot eval "zfs receive -s $recvfs < $incr_stream"
+
+token=$(get_prop receive_resume_token $recvfs)
+[[ -n "$token" && "$token" != "-" ]] || \
+    log_fail "no resume token left behind by partial receive"
+log_must eval "zfs send -t $token > $resumed_stream"
+
+verify_xdr_nvlist_encoding $resumed_stream
+log_must eval "zfs receive -s $recvfs < $resumed_stream"
+
+log_pass "BEGIN nvlist of a token-resumed raw incremental from a redaction " \
+    "bookmark is XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh
new file mode 100755
index 00000000000..6c0b6b5b4ec
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh
@@ -0,0 +1,116 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# This is the post-redact-write variant of xdr_resume_bookmark_raw,
+# separated out because of a known issue (#18491) that causes it to fail
+# roughly 30% of the time. It's included here as a test for issue #18491
+# until the exact source of that problem can be pinned down more specifically.
+#
+# Known issue: openzfs/zfs#18491
+#
+# On a freshly-created pool, `zfs send -w -i ds#book ds@snap` intermittently
+# fails with EACCES whenever there is data-modifying activity between the
+# `zfs redact` that created the bookmark and the subsequent send. This EACCES
+# is surfaced to userspace as the misleading message "dataset key must be
+# loaded," although the key remains loaded throughout.
+#
+# The reproducer script included in the issue report typically triggers the
+# problem within about 10 iterations on a fresh pool. Disk-sync mitigations
+# (zpool sync, with or without `-f`, with or without sleep, single or doubled,
+# applied at any reasonable point) do not avert the problem. CI runs that
+# include the test in this file reproduce the failure regularly (though
+# intermittently) across multiple distributions. xdr_resume_bookmark_raw.ksh
+# removes the post-redact write (which is not essential to the test) and
+# therefore runs reliably.
+#
+# When this test fails, the failure marker is the libzfs warning
+# "dataset key must be loaded" on stderr from the first `zfs send -w -i`
+# line below (the one that produces the stream we then truncate), and a
+# non-zero exit from that send. The test does not attempt to distinguish
+# the known-issue failure from other possible failures.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_resume_bookmark_raw_with_write_src"
+clonefs="$POOL/xdr_resume_bookmark_raw_with_write_clone"
+recvfs="$POOL2/xdr_resume_bookmark_raw_with_write_recv"
+keyfile="/$POOL/xdr_resume_bookmark_raw_with_write.key"
+full_stream="/$POOL/xdr_resume_bookmark_raw_with_write_full.zsend"
+incr_stream="/$POOL/xdr_resume_bookmark_raw_with_write_incr.zsend"
+resumed_stream="/$POOL/xdr_resume_bookmark_raw_with_write_resumed.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $keyfile $full_stream $incr_stream $resumed_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a token-resumed raw incremental from a redaction " \
+    "bookmark, with a post-redact write, is XDR-encoded and receivable " \
+    "(known to flake; see openzfs/zfs#18491)"
+
+log_must eval "echo 'thisisapassphrase' > $keyfile"
+log_must zfs create -o encryption=on -o keyformat=passphrase \
+    -o keylocation=file://$keyfile $sendfs
+
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=16 status=none
+log_must zfs snapshot $sendfs@s0
+
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=16 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+# Post-redact write: the trigger for openzfs/zfs#18491.
+log_must dd if=/dev/urandom of=/$sendfs/f3 bs=128k count=16 status=none
+log_must zfs snapshot $sendfs@s1
+
+# Establish a raw base on the receiver.
+log_must eval "zfs send -w $sendfs@s0 > $full_stream"
+log_must eval "zfs receive $recvfs < $full_stream"
+
+# The next line is what races. On failure it exits with EACCES rendered
+# as "dataset key must be loaded".
+log_must eval "zfs send -w -i $sendfs#redaction-bookmark $sendfs@s1 > \
+    $incr_stream"
+mess_send_file $incr_stream
+log_mustnot eval "zfs receive -s $recvfs < $incr_stream"
+
+token=$(get_prop receive_resume_token $recvfs)
+[[ -n "$token" && "$token" != "-" ]] || \
+    log_fail "no resume token left behind by partial receive"
+log_must eval "zfs send -t $token > $resumed_stream"
+
+verify_xdr_nvlist_encoding $resumed_stream
+log_must eval "zfs receive -s $recvfs < $resumed_stream"
+
+log_pass "BEGIN nvlist of a token-resumed raw incremental from a redaction " \
+    "bookmark, with a post-redact write, is XDR-encoded and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_raw.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_raw.ksh
new file mode 100755
index 00000000000..a96df10b945
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_raw.ksh
@@ -0,0 +1,79 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# A resumed raw send (zfs send -t <token> for a raw stream of an encrypted
+# dataset) carries both BEGINNV_RESUME_{OBJECT,OFFSET} and the "crypt_keydata"
+# nested nvlist in its DRR_BEGIN nvlist payload. Verify that this combined
+# payload is XDR-encoded and the resumed stream can be received.
+#
+# Strategy:
+# 1. Create an encrypted dataset with one snapshot.
+# 2. zfs send -w to a file, truncate it, then attempt to zfs receive the
+#    stream so that a resume token is left behind.
+# 3. zfs send -t <token> to produce the resumed raw stream.
+# 4. Verify that the resumed stream is XDR-encoded.
+# 5. Verify that zfs receive -s receives the resumed stream successfully.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_resume_raw_src"
+recvfs="$POOL2/xdr_resume_raw_recv"
+keyfile="/$POOL/xdr_resume_raw.key"
+full_stream="/$POOL/xdr_resume_raw_full.zsend"
+resumed_stream="/$POOL/xdr_resume_raw_resumed.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -r
+	datasetexists $recvfs && destroy_dataset $recvfs -r
+	rm -f $keyfile $full_stream $resumed_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a token-resumed raw send is XDR-encoded " \
+    "and receivable"
+
+log_must eval "echo 'thisisapassphrase' > $keyfile"
+log_must zfs create -o encryption=on -o keyformat=passphrase \
+    -o keylocation=file://$keyfile $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=16 status=none
+log_must zfs snapshot $sendfs@s1
+
+log_must eval "zfs send -w $sendfs@s1 > $full_stream"
+mess_send_file $full_stream
+log_mustnot eval "zfs receive -s $recvfs < $full_stream"
+
+token=$(get_prop receive_resume_token $recvfs)
+[[ -n "$token" && "$token" != "-" ]] || \
+    log_fail "no resume token left behind by partial receive"
+log_must eval "zfs send -t $token > $resumed_stream"
+
+verify_xdr_nvlist_encoding $resumed_stream
+log_must eval "zfs receive -s $recvfs < $resumed_stream"
+
+log_pass "BEGIN nvlist of a token-resumed raw send is XDR-encoded " \
+    "and receivable"
diff --git a/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_redacted.ksh b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_redacted.ksh
new file mode 100755
index 00000000000..6cee3e51a3d
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_redacted.ksh
@@ -0,0 +1,86 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Garth Snyder. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib
+
+#
+# Description:
+# A resumed redacted send (zfs send -t <token> for a redacted stream)
+# carries both BEGINNV_REDACT_SNAPS and BEGINNV_RESUME_{OBJECT,OFFSET} in
+# its DRR_BEGIN nvlist payload. Verify that this combined payload is
+# XDR-encoded and the resumed stream can be received.
+#
+# Strategy:
+# 1. Create a source dataset with a redaction bookmark.
+# 2. zfs send --redact <book> sendfs@snap to a file, truncate it, then
+#    attempt zfs receive so that a resume token is left behind.
+# 3. zfs send -t <token> to produce a resumed redacted stream.
+# 4. Verify that the resumed stream is XDR-encoded.
+# 5. Verify that zfs receive -s of the resumed stream is successful.
+#
+
+verify_runnable "both"
+
+sendfs="$POOL/xdr_resume_redacted_src"
+clonefs="$POOL/xdr_resume_redacted_clone"
+recvfs="$POOL2/xdr_resume_redacted_recv"
+full_stream="/$POOL/xdr_resume_redacted_full.zsend"
+resumed_stream="/$POOL/xdr_resume_redacted_resumed.zsend"
+
+function cleanup
+{
+	datasetexists $sendfs && destroy_dataset $sendfs -R
+	datasetexists $recvfs && destroy_dataset $recvfs -R
+	rm -f $full_stream $resumed_stream
+}
+log_onexit cleanup
+
+log_assert "BEGIN nvlist of a token-resumed redacted send is XDR-encoded " \
+    "and receivable"
+
+log_must zfs create $sendfs
+log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=16 status=none
+log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=16 status=none
+log_must zfs snapshot $sendfs@s0
+
+log_must zfs clone $sendfs@s0 $clonefs
+log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=16 conv=notrunc \
+    status=none
+log_must zfs snapshot $clonefs@s
+
+log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s
+
+log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream"
+mess_send_file $full_stream
+log_mustnot eval "zfs receive -s $recvfs < $full_stream"
+
+token=$(get_prop receive_resume_token $recvfs)
+[[ -n "$token" && "$token" != "-" ]] || \
+    log_fail "no resume token left behind by partial receive"
+log_must eval "zfs send -t $token > $resumed_stream"
+
+verify_xdr_nvlist_encoding $resumed_stream
+log_must eval "zfs receive -s $recvfs < $resumed_stream"
+
+log_pass "BEGIN nvlist of a token-resumed redacted send is XDR-encoded " \
+    "and receivable"
+

From 59e10e7b928371d08cb9609b8c0bca864319aaeb Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Tue, 12 May 2026 16:50:33 +0200
Subject: [PATCH 020/129] libzfs_pool: document export and initialize functions

Add brief docstrings to zpool_export(), zpool_export_force(),
zpool_initialize() and zpool_initialize_wait().

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18514
---
 lib/libzfs/libzfs_pool.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index fd957d98313..7c4c081edb4 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2031,12 +2031,21 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
 	return (0);
 }
 
+/*
+ * Export the pool from the system.  Setting force overrides the
+ * active-shared-spare check.  The caller must unmount all datasets
+ * in the pool first.
+ */
 int
 zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str)
 {
 	return (zpool_export_common(zhp, force, B_FALSE, log_str));
 }
 
+/*
+ * Force-export the pool: bypasses the active-shared-spare check, and skips
+ * writing the exported-state labels and updating the cachefile.
+ */
 int
 zpool_export_force(zpool_handle_t *zhp, const char *log_str)
 {
@@ -2685,6 +2694,10 @@ zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
 	return (err == 0 ? 0 : -1);
 }
 
+/*
+ * Start (or cancel/suspend/uninit) the initialize operation on the listed
+ * vdevs.  Returns once the new state is committed.
+ */
 int
 zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
     nvlist_t *vds)
@@ -2692,6 +2705,9 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
 	return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE));
 }
 
+/*
+ * Like zpool_initialize(), but waits for each listed vdev to finish.
+ */
 int
 zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
     nvlist_t *vds)

From 90a174038eada46925f343fe67e7e978a8990865 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 12 May 2026 11:54:47 -0700
Subject: [PATCH 021/129] CI: FreeBSD 15.1 STABLE

Update the freebsd15-1s builder to the released STABLE image.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18524
---
 .github/workflows/scripts/qemu-2-start.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh
index e63aece389c..3c1f456ed0c 100755
--- a/.github/workflows/scripts/qemu-2-start.sh
+++ b/.github/workflows/scripts/qemu-2-start.sh
@@ -131,7 +131,7 @@ case "$OS" in
     KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
     ;;
   freebsd15-1s)
-    FreeBSD="15.1-PRERELEASE"
+    FreeBSD="15.1-STABLE"
     OSNAME="FreeBSD $FreeBSD"
     OSv="freebsd14.0"
     URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"

From 414ce4b5fc59a0bb214895e59af48ecebca0672c Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Tue, 12 May 2026 23:23:12 +0200
Subject: [PATCH 022/129] Linux: expose zfs_arc_no_grow_shift as a module
 parameter

The zfs_arc_no_grow_shift variable is tunable via sysctl on FreeBSD
but had no module parameter registration on Linux.

Register it once in arc.c using param_get_uint and a per-platform
set handler, replacing the FreeBSD-only registration.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alek Pinchuk <alek.pinchuk@connectwise.com>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18461
---
 include/os/freebsd/zfs/sys/arc_os.h |  1 -
 include/sys/arc_impl.h              |  3 ++-
 man/man4/zfs.4                      |  2 --
 module/os/freebsd/zfs/arc_os.c      |  3 ---
 module/os/freebsd/zfs/sysctl_os.c   |  4 ++--
 module/os/linux/zfs/arc_os.c        | 18 ++++++++++++++++++
 module/zfs/arc.c                    | 13 +++++++++----
 7 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/include/os/freebsd/zfs/sys/arc_os.h b/include/os/freebsd/zfs/sys/arc_os.h
index ad2aba23b90..6334d453f48 100644
--- a/include/os/freebsd/zfs/sys/arc_os.h
+++ b/include/os/freebsd/zfs/sys/arc_os.h
@@ -29,6 +29,5 @@
 #define	_SYS_ARC_OS_H
 
 int param_set_arc_free_target(SYSCTL_HANDLER_ARGS);
-int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
 
 #endif
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 8fde5c4fe50..7fbf5cee4fa 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -1105,7 +1105,7 @@ extern arc_sums_t arc_sums;
 extern hrtime_t arc_growtime;
 extern boolean_t arc_warm;
 extern uint_t arc_grow_retry;
-extern uint_t arc_no_grow_shift;
+extern uint_t zfs_arc_no_grow_shift;
 extern uint_t arc_shrink_shift;
 extern kmutex_t arc_prune_mtx;
 extern list_t arc_prune_list;
@@ -1136,6 +1136,7 @@ extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
 extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
 extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);
 extern int param_set_l2arc_dwpd_limit(ZFS_MODULE_PARAM_ARGS);
+extern int param_set_arc_no_grow_shift(ZFS_MODULE_PARAM_ARGS);
 extern void l2arc_dwpd_bump_reset(void);
 
 /* used in zdb.c */
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 29fdbd3eb44..11b6c622f8e 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -955,8 +955,6 @@ equivalent to the greater of the number of online CPUs and
 If less than
 .Sy arc_c No >> Sy zfs_arc_no_grow_shift
 free memory is available, the ARC is not allowed to grow.
-This parameter is
-.Fx Ns -specific .
 .
 .It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int
 The ARC size is considered to be overflowing if it exceeds the current
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
index 02a2870c02b..7cb390cab23 100644
--- a/module/os/freebsd/zfs/arc_os.c
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -72,9 +72,6 @@ SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, free_target,
     param_set_arc_free_target, 0, CTLFLAG_RW,
 	"Desired number of free pages below which ARC triggers reclaim");
-ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift,
-    param_set_arc_no_grow_shift, 0, ZMOD_RW,
-	"log2(fraction of ARC which must be free to allow growing)");
 
 int64_t
 arc_available_memory(void)
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index a06e00d7373..934055da88d 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -256,7 +256,7 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
 {
 	int err, val;
 
-	val = arc_no_grow_shift;
+	val = zfs_arc_no_grow_shift;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
@@ -264,7 +264,7 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
 	if (val < 0 || val >= arc_shrink_shift)
 		return (EINVAL);
 
-	arc_no_grow_shift = val;
+	zfs_arc_no_grow_shift = val;
 
 	return (0);
 }
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index dbc9aad936b..05f4fb51b4b 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -410,6 +410,24 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
 	return (0);
 }
 
+int
+param_set_arc_no_grow_shift(const char *buf, zfs_kernel_param_t *kp)
+{
+	unsigned long val;
+	int error;
+
+	error = kstrtoul(buf, 0, &val);
+	if (error)
+		return (SET_ERROR(error));
+
+	if (val >= arc_shrink_shift)
+		return (-SET_ERROR(EINVAL));
+
+	zfs_arc_no_grow_shift = val;
+
+	return (0);
+}
+
 int
 param_set_l2arc_dwpd_limit(const char *buf, zfs_kernel_param_t *kp)
 {
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 22b189d5bb8..b2dba3e0ff9 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -398,14 +398,14 @@ uint_t zfs_arc_pc_percent = 0;
 
 /*
  * log2(fraction of ARC which must be free to allow growing).
- * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
+ * I.e. If there is less than arc_c >> zfs_arc_no_grow_shift free memory,
  * when reading a new block into the ARC, we will evict an equal-sized block
  * from the ARC.
  *
  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
  * we will still not allow it to grow.
  */
-uint_t		arc_no_grow_shift = 5;
+uint_t		zfs_arc_no_grow_shift = 5;
 
 
 /*
@@ -4976,7 +4976,7 @@ arc_reap_cb_check(void *arg, zthr_t *zthr)
 		 */
 		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
 		return (B_TRUE);
-	} else if (free_memory < arc_c >> arc_no_grow_shift) {
+	} else if (free_memory < arc_c >> zfs_arc_no_grow_shift) {
 		arc_no_grow = B_TRUE;
 	} else if (gethrtime() >= arc_growtime) {
 		arc_no_grow = B_FALSE;
@@ -7656,7 +7656,8 @@ arc_tuning_update(boolean_t verbose)
 	/* Valid range: 1 - N */
 	if (zfs_arc_shrink_shift) {
 		arc_shrink_shift = zfs_arc_shrink_shift;
-		arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
+		zfs_arc_no_grow_shift = MIN(zfs_arc_no_grow_shift,
+		    arc_shrink_shift - 1);
 	}
 
 	/* Valid range: 1 - N ms */
@@ -11703,6 +11704,10 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)");
 
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift,
+	param_set_arc_no_grow_shift, param_get_uint, ZMOD_RW,
+	"log2(fraction of ARC which must be free to allow growing)");
+
 #ifdef _KERNEL
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
 	"Percent of pagecache to reclaim ARC to");

From 532760e19777a6be179b4c96e9af1678832dbd9e Mon Sep 17 00:00:00 2001
From: Gality <68463495+Gality369@users.noreply.github.com>
Date: Wed, 13 May 2026 05:23:57 +0800
Subject: [PATCH 023/129] Linux: avoid znode list lock inversion during resume

Lockdep reports a circular locking dependency during mounted filesystem
rollback.  zfs_resume_fs() walks z_all_znodes under z_znodes_lock and
calls zfs_rezget(), which takes the per-object znode hold lock via
zfs_znode_hold_enter().

The normal zget path takes these locks in the opposite order.
zfs_zget() takes the per-object hold lock before zfs_znode_alloc()
inserts the znode on z_all_znodes under z_znodes_lock.  Resume can
therefore establish z_znodes_lock -> zh_lock while normal lookup
creates zh_lock -> z_znodes_lock.

Pin the current and next znodes with igrab() while holding the list
lock, then drop the list lock before reloading the znode.  Existing
stale inode handling is preserved, and both the suspended reference
and temporary walk reference are released asynchronously.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: ZhengYuan Huang <gality369@gmail.com>
Closes #18517
---
 module/os/linux/zfs/zfs_vfsops.c | 45 +++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index d7b50242992..27f3bbb46f4 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -1689,6 +1689,24 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs)
 	return (0);
 }
 
+/*
+ * Return a referenced znode at or after zp.  The z_znodes_lock protects the
+ * list walk; the returned inode reference keeps the znode alive after the
+ * lock is dropped for zfs_rezget().
+ */
+static znode_t *
+zfs_resume_hold_next_znode(zfsvfs_t *zfsvfs, znode_t *zp)
+{
+	ASSERT(MUTEX_HELD(&zfsvfs->z_znodes_lock));
+
+	for (; zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+		if (igrab(ZTOI(zp)) != NULL)
+			return (zp);
+	}
+
+	return (NULL);
+}
+
 /*
  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
  * is an invariant across any of the operations that can be performed while the
@@ -1732,13 +1750,23 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 	 * dbufs.  If a zfs_rezget() fails, then we unhash the inode
 	 * and mark it stale.  This prevents a collision if a new
 	 * inode/object is created which must use the same inode
-	 * number.  The stale inode will be be released when the
-	 * VFS prunes the dentry holding the remaining references
-	 * on the stale inode.
+	 * number.  The stale inode will be released when the VFS
+	 * prunes the dentry holding the remaining references on
+	 * the stale inode.
+	 *
+	 * zfs_rezget() takes the per-object znode hold lock.  Pin each znode
+	 * while holding z_znodes_lock, then drop the list lock before calling
+	 * zfs_rezget() to preserve the normal zh_lock -> z_znodes_lock order.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
-	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
-	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+	zp = zfs_resume_hold_next_znode(zfsvfs,
+	    list_head(&zfsvfs->z_all_znodes));
+	while (zp != NULL) {
+		znode_t *next = zfs_resume_hold_next_znode(zfsvfs,
+		    list_next(&zfsvfs->z_all_znodes, zp));
+
+		mutex_exit(&zfsvfs->z_znodes_lock);
+
 		err2 = zfs_rezget(zp);
 		if (err2) {
 			zpl_d_drop_aliases(ZTOI(zp));
@@ -1747,9 +1775,14 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 
 		/* see comment in zfs_suspend_fs() */
 		if (zp->z_suspended) {
-			zfs_zrele_async(zp);
 			zp->z_suspended = B_FALSE;
+			zfs_zrele_async(zp);
 		}
+
+		zfs_zrele_async(zp);
+
+		mutex_enter(&zfsvfs->z_znodes_lock);
+		zp = next;
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 

From 58c8dc5f6926eb96903a3f38b141e8998ef9261b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 14 May 2026 00:37:53 +1000
Subject: [PATCH 024/129] linux/zpl_super: handle 'source' option directly

vfs_parse_fs_param_source() didn't appear until 5.14, and was not
backported to kernel.org LTS kernels. It's simple enough that it's
easier to just handle it ourselves rather than use a configure check.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18529
---
 module/os/linux/zfs/zpl_super.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index 2cd0f17c860..c1460edd16d 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -550,10 +550,11 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
  *
  * Finally, all filesystems get automatic handling for the 'source' option,
  * that is, the "name" of the filesystem (the first column of df(1)'s output).
- * However, this only happens if the handler does not otherwise handle
- * the 'source' option. Since we handle _all_ options because of 'sloppy', we
- * deal with this explicitly by calling into the kernel's helper for this,
- * vfs_parse_fs_param_source(), which sets up fc->source.
+ * However, this only happens if the handler does not otherwise handle the
+ * 'source' option. Since we handle _all_ options because of 'sloppy', we have
+ * to handle it ourselves. Normally we would call vfs_parse_fs_param_source()
+ * to deal with this, but that didn't appear until 5.14, and it's small enough
+ * that we can just handle it ourselves.
  *
  *	source
  *
@@ -565,6 +566,7 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
  */
 
 enum {
+	Opt_source,
 	Opt_exec, Opt_suid, Opt_dev,
 	Opt_atime, Opt_relatime, Opt_strictatime,
 	Opt_saxattr, Opt_dirxattr, Opt_noxattr,
@@ -574,6 +576,8 @@ enum {
 };
 
 static const struct fs_parameter_spec zpl_param_spec[] = {
+	fsparam_string("source",	Opt_source),
+
 	fsparam_flag_no("exec",		Opt_exec),
 	fsparam_flag_no("suid",		Opt_suid),
 	fsparam_flag_no("dev",		Opt_dev),
@@ -614,13 +618,8 @@ zpl_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	vfs_t *vfs = fc->fs_private;
 
-	/* Handle 'source' explicitly so we don't trip on it as an unknown. */
-	int opt = vfs_parse_fs_param_source(fc, param);
-	if (opt != -ENOPARAM)
-		return (opt);
-
 	struct fs_parse_result result;
-	opt = fs_parse(fc, zpl_param_spec, param, &result);
+	int opt = fs_parse(fc, zpl_param_spec, param, &result);
 	if (opt == -ENOPARAM) {
 		/*
 		 * Convert unknowns to warnings, to work around the whole
@@ -632,6 +631,16 @@ zpl_parse_param(struct fs_context *fc, struct fs_parameter *param)
 		return (opt);
 
 	switch (opt) {
+	case Opt_source:
+		if (fc->source != NULL) {
+			cmn_err(CE_NOTE,
+			    "ZFS: multiple 'source' options not supported");
+			return (-SET_ERROR(EINVAL));
+		}
+		fc->source = param->string;
+		param->string = NULL;
+		break;
+
 	case Opt_exec:
 		vfs->vfs_exec = !result.negated;
 		vfs->vfs_do_exec = B_TRUE;

From 181e1b52276ae29997902faf886d8298a77b39f8 Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Mon, 11 May 2026 16:26:09 -0400
Subject: [PATCH 025/129] Fix double free for blocks cloned after DDT prune

Before this change, for blocks marked with D flag but absent in DDT
(pruned from it), zio_ddt_free() fell back to ZIO_STAGE_DVA_FREE
without trying ZIO_STAGE_BRT_FREE first.  At the same time, such
blocks might be present in BRT, and not handling that would result
in a double/multiple free.

This change makes ZIO_DDT_FREE_PIPELINE include ZIO_FREE_PIPELINE,
just adding required ZIO_STAGE_ISSUE_ASYNC and ZIO_STAGE_DDT_FREE,
and moves DDT stages before BRT.  This way, if the block is found
in DDT by zio_ddt_free(), the pipeline is short-circuited to
ZIO_INTERLOCK_PIPELINE, similar to what zio_brt_free() does.  If
not, then BRT is checked, and if also no match, the block is freed.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@truenas.com>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18520
---
 include/sys/zio_impl.h                        |  13 +-
 man/man8/zpool-events.8                       |  10 +-
 module/zcommon/zfs_valstr.c                   |   2 +-
 module/zfs/zio.c                              |  23 ++-
 tests/runfiles/common.run                     |   8 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../functional/dedup/dedup_bclone_pruned.ksh  | 152 ++++++++++++++++++
 7 files changed, 184 insertions(+), 25 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh

diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index 42147adaf1a..62e7e27da38 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -139,12 +139,12 @@ enum zio_stage {
 
 	ZIO_STAGE_NOP_WRITE		= 1 << 8,	/* -W---- */
 
-	ZIO_STAGE_BRT_FREE		= 1 << 9,	/* --F--- */
+	ZIO_STAGE_DDT_READ_START	= 1 << 9,	/* R----- */
+	ZIO_STAGE_DDT_READ_DONE		= 1 << 10,	/* R----- */
+	ZIO_STAGE_DDT_WRITE		= 1 << 11,	/* -W---- */
+	ZIO_STAGE_DDT_FREE		= 1 << 12,	/* --F--- */
 
-	ZIO_STAGE_DDT_READ_START	= 1 << 10,	/* R----- */
-	ZIO_STAGE_DDT_READ_DONE		= 1 << 11,	/* R----- */
-	ZIO_STAGE_DDT_WRITE		= 1 << 12,	/* -W---- */
-	ZIO_STAGE_DDT_FREE		= 1 << 13,	/* --F--- */
+	ZIO_STAGE_BRT_FREE		= 1 << 13,	/* --F--- */
 
 	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 14,	/* RWFC-- */
 	ZIO_STAGE_GANG_ISSUE		= 1 << 15,	/* RWFC-- */
@@ -259,8 +259,7 @@ enum zio_stage {
 	ZIO_STAGE_DVA_FREE)
 
 #define	ZIO_DDT_FREE_PIPELINE			\
-	(ZIO_INTERLOCK_STAGES |			\
-	ZIO_STAGE_FREE_BP_INIT |		\
+	(ZIO_FREE_PIPELINE |			\
 	ZIO_STAGE_ISSUE_ASYNC |			\
 	ZIO_STAGE_DDT_FREE)
 
diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8
index 3753139bdfe..12a11058072 100644
--- a/man/man8/zpool-events.8
+++ b/man/man8/zpool-events.8
@@ -458,12 +458,12 @@ ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W----
 
 ZIO_STAGE_NOP_WRITE:0x00000100:-W----
 
-ZIO_STAGE_BRT_FREE:0x00000200:--F---
+ZIO_STAGE_DDT_READ_START:0x00000200:R-----
+ZIO_STAGE_DDT_READ_DONE:0x00000400:R-----
+ZIO_STAGE_DDT_WRITE:0x00000800:-W----
+ZIO_STAGE_DDT_FREE:0x00001000:--F---
 
-ZIO_STAGE_DDT_READ_START:0x00000400:R-----
-ZIO_STAGE_DDT_READ_DONE:0x00000800:R-----
-ZIO_STAGE_DDT_WRITE:0x00001000:-W----
-ZIO_STAGE_DDT_FREE:0x00002000:--F---
+ZIO_STAGE_BRT_FREE:0x00002000:--F---
 
 ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC--
 ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC--
diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c
index 0cb9f584acc..41a2313e575 100644
--- a/module/zcommon/zfs_valstr.c
+++ b/module/zcommon/zfs_valstr.c
@@ -238,11 +238,11 @@ _VALSTR_BITFIELD_IMPL(zio_stage,
 	{ 'E', "EN", "ENCRYPT" },
 	{ 'C', "CG", "CHECKSUM_GENERATE" },
 	{ 'N', "NW", "NOP_WRITE" },
-	{ 'B', "BF", "BRT_FREE" },
 	{ 'd', "dS", "DDT_READ_START" },
 	{ 'd', "dD", "DDT_READ_DONE" },
 	{ 'd', "dW", "DDT_WRITE" },
 	{ 'd', "dF", "DDT_FREE" },
+	{ 'B', "BF", "BRT_FREE" },
 	{ 'G', "GA", "GANG_ASSEMBLE" },
 	{ 'G', "GI", "GANG_ISSUE" },
 	{ 'D', "DT", "DVA_THROTTLE" },
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 94b44561bd9..3e95103385c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4168,14 +4168,21 @@ zio_ddt_free(zio_t *zio)
 	}
 	ddt_exit(ddt);
 
-	/*
-	 * When no entry was found, it must have been pruned,
-	 * so we can free it now instead of decrementing the
-	 * refcount in the DDT.
-	 */
-	if (!dde) {
+	if (dde) {
+		/*
+		 * DDT entry found and the refcount has been decremented.
+		 * Stop the pipeline — there is nothing more to do right now.
+		 */
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+	} else {
+		/*
+		 * No DDT entry; the block must have been pruned from the
+		 * table.  Clear the DEDUP bit so it is treated as a normal
+		 * block from here on.  BRT_FREE and DVA_FREE follow in the
+		 * pipeline and will handle any cloned references and the
+		 * actual block free respectively.
+		 */
 		BP_SET_DEDUP(bp, 0);
-		zio->io_pipeline |= ZIO_STAGE_DVA_FREE;
 	}
 
 	return (zio);
@@ -5925,11 +5932,11 @@ static zio_pipe_stage_t *zio_pipeline[] = {
 	zio_encrypt,
 	zio_checksum_generate,
 	zio_nop_write,
-	zio_brt_free,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
 	zio_ddt_free,
+	zio_brt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
 	zio_dva_throttle,
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index f18835da74b..fbce8c8db65 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -717,10 +717,10 @@ post =
 tags = ['functional', 'deadman']
 
 [tests/functional/dedup]
-tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing',
-    'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
-    'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_prune_leak',
-    'dedup_zap_shrink']
+tests = ['dedup_bclone_pruned', 'dedup_fdt_create', 'dedup_fdt_import',
+    'dedup_fdt_pacing', 'dedup_legacy_create', 'dedup_legacy_import',
+    'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota',
+    'dedup_prune', 'dedup_prune_leak', 'dedup_zap_shrink']
 pre =
 post =
 tags = ['functional', 'dedup']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 5dd350ece7c..7c8dbfe5fcd 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1503,6 +1503,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/deadman/deadman_zio.ksh \
 	functional/dedup/cleanup.ksh \
 	functional/dedup/setup.ksh \
+	functional/dedup/dedup_bclone_pruned.ksh \
 	functional/dedup/dedup_fdt_create.ksh \
 	functional/dedup/dedup_fdt_import.ksh \
 	functional/dedup/dedup_fdt_pacing.ksh \
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh
new file mode 100755
index 00000000000..d01d09ac12e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh
@@ -0,0 +1,152 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026, TrueNAS.
+#
+
+#
+# DESCRIPTION:
+#	Verify that block cloning works correctly when the DDT entry for a
+#	dedup block has been pruned.  When a block has the DEDUP bit set but
+#	no DDT entry (because it was pruned), cloning it must create a BRT
+#	entry to track the extra reference.  Freeing the original must
+#	consult the BRT rather than proceeding directly to a DVA free,
+#	otherwise the block is freed while the clone still references it.
+#
+# STRATEGY:
+#	1. Create a pool with both dedup and block_cloning enabled
+#	2. Write a file with dedup=on so blocks get DEDUP bit set in their BPs
+#	3. Prune the DDT to remove those entries (blocks remain, DEDUP bit
+#	   stays set in block pointers)
+#	4. Clone the file - brt_pending_apply_vdev() must fall back to BRT
+#	   since ddt_addref() returns B_FALSE for pruned entries
+#	5. Write a second copy via dd - same hash, new physical blocks, new
+#	   DDT entries at different DVAs from the BRT-tracked blocks
+#	6. Delete the clone first - must go through BRT, not DDT, even though
+#	   a matching DDT entry now exists for the same hash
+#	7. Delete the dd copy - DDT entries freed normally
+#	8. Delete the original - no DDT entry, no BRT entry, DVA freed
+#	9. Verify reference counts with zdb -b at each step
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+log_assert "Block cloning of dedup blocks with pruned DDT entries uses BRT"
+
+# Flush DDT log every TXG so entries appear in the ZAP immediately,
+# making ddtprune effective and test behavior predictable.
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN
+log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000
+
+function cleanup
+{
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
+	log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN
+}
+
+log_onexit cleanup
+
+log_must zpool create -f -o feature@block_cloning=enabled $TESTPOOL $DISKS
+
+log_must zfs create -o dedup=sha256 -o recordsize=128k $TESTPOOL/$TESTFS
+typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+
+# Write unique data: each block gets a DDT entry with refcnt=1.
+log_must dd if=/dev/urandom of=$mountpoint/file1 bs=128k count=8
+
+sync_pool $TESTPOOL
+
+# Verify DDT has entries before pruning.
+typeset entries=$(zpool status -D $TESTPOOL | \
+    grep "dedup: DDT entries" | awk '{print $4}')
+log_must test "$entries" -eq 8
+
+# Sleep 1s so the DDT entries are at least 1 second old.  ddtprune uses
+# an age-based cutoff and will silently skip entries that are too fresh.
+sleep 1
+
+# Prune all unique (refcnt=1) entries.  The blocks remain on disk and the
+# block pointers in file1 still have the DEDUP bit set, but there is no
+# longer a DDT entry for them.
+log_must zpool ddtprune -p 100 $TESTPOOL
+sync_pool $TESTPOOL
+
+# Confirm the prune actually removed all entries.
+entries=$(zpool status -D $TESTPOOL | \
+    grep "dedup: DDT entries" | awk '{print $4}')
+[[ -z "$entries" || "$entries" -eq 0 ]] || \
+    log_fail "DDT entries not pruned: $entries remain"
+
+# Clone file1.  brt_pending_apply_vdev() will see the DEDUP bit, call
+# ddt_addref(), receive B_FALSE (no DDT entry), and fall through to
+# create BRT entries instead.
+log_must clonefile -f $mountpoint/file1 $mountpoint/clone1
+sync_pool $TESTPOOL
+
+# BRT entries exist; reference counts must be consistent.
+log_must zdb -b $TESTPOOL
+
+# Write a second copy via dd.  Since the DDT was pruned, dedup can't find
+# an existing entry and writes new physical blocks at new DVAs, creating
+# fresh DDT entries with refcnt=1.  The BRT-tracked blocks (file1/clone1)
+# are at the old DVAs and are unaffected.
+log_must dd if=$mountpoint/file1 of=$mountpoint/file2 bs=128k
+sync_pool $TESTPOOL
+
+# Eight new unique DDT entries (file2's blocks); BRT still holds refs for
+# file1/clone1's old blocks.
+typeset entries=$(zpool status -D $TESTPOOL | \
+    grep "dedup: DDT entries" | awk '{print $4}')
+log_must test "$entries" -eq 8
+log_must zdb -b $TESTPOOL
+
+# Delete the clone first.  Its blocks carry the DEDUP bit and the same
+# hash as file2's DDT entries, but the DVAs differ — the free must go
+# through BRT, not DDT, leaving file2's DDT entries intact.
+log_must rm $mountpoint/clone1
+sync_pool $TESTPOOL
+
+entries=$(zpool status -D $TESTPOOL | \
+    grep "dedup: DDT entries" | awk '{print $4}')
+log_must test "$entries" -eq 8
+log_must zdb -b $TESTPOOL
+
+# Delete file2.  DDT entries freed; file1's BRT-tracked blocks unaffected.
+log_must rm $mountpoint/file2
+sync_pool $TESTPOOL
+log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
+log_must zdb -b $TESTPOOL
+
+# Delete the original.  No DDT entry, no BRT entry; DVA freed directly.
+log_must rm $mountpoint/file1
+sync_pool $TESTPOOL
+log_must zdb -b $TESTPOOL
+
+log_pass "Block cloning of dedup blocks with pruned DDT entries uses BRT"

From f5733f6fa3bd592556ebd52e88e2d5d650b20185 Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Tue, 12 May 2026 10:04:22 -0400
Subject: [PATCH 026/129] Integrate DDT and BRT tests

Don't disable block cloning during dedup tests.  Instead, copy files
with dd rather than cp so that cloning is not triggered.  Add a new
test that explicitly mixes dedup and cloning on the same file, where
the extra references should be handled by the DDT.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@truenas.com>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18520
---
 tests/runfiles/common.run                     |   3 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../tests/functional/dedup/dedup_bclone.ksh   | 120 ++++++++++++++++++
 .../functional/dedup/dedup_fdt_create.ksh     |   6 +-
 .../functional/dedup/dedup_fdt_import.ksh     |   4 +-
 .../functional/dedup/dedup_fdt_pacing.ksh     |   4 +-
 .../functional/dedup/dedup_legacy_create.ksh  |   6 +-
 .../dedup/dedup_legacy_fdt_mixed.ksh          |   4 +-
 .../dedup/dedup_legacy_fdt_upgrade.ksh        |   6 +-
 .../functional/dedup/dedup_legacy_import.ksh  |   4 +-
 .../tests/functional/dedup/dedup_prune.ksh    |   4 +-
 11 files changed, 135 insertions(+), 27 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_bclone.ksh

diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index fbce8c8db65..003e1c35495 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -717,7 +717,8 @@ post =
 tags = ['functional', 'deadman']
 
 [tests/functional/dedup]
-tests = ['dedup_bclone_pruned', 'dedup_fdt_create', 'dedup_fdt_import',
+tests = ['dedup_bclone', 'dedup_bclone_pruned', 'dedup_fdt_create',
+    'dedup_fdt_import',
     'dedup_fdt_pacing', 'dedup_legacy_create', 'dedup_legacy_import',
     'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota',
     'dedup_prune', 'dedup_prune_leak', 'dedup_zap_shrink']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 7c8dbfe5fcd..75b53c6ddd0 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1503,6 +1503,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/deadman/deadman_zio.ksh \
 	functional/dedup/cleanup.ksh \
 	functional/dedup/setup.ksh \
+	functional/dedup/dedup_bclone.ksh \
 	functional/dedup/dedup_bclone_pruned.ksh \
 	functional/dedup/dedup_fdt_create.ksh \
 	functional/dedup/dedup_fdt_import.ksh \
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_bclone.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_bclone.ksh
new file mode 100755
index 00000000000..57f54d93ad4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_bclone.ksh
@@ -0,0 +1,120 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026, TrueNAS.
+#
+
+#
+# DESCRIPTION:
+#	Verify that block cloning interacts correctly with dedup when the DDT
+#	entry for the block is still present.  In this case brt_pending_apply_vdev()
+#	calls ddt_addref() which succeeds, so the extra reference is tracked in
+#	the DDT rather than in the BRT.
+#
+# STRATEGY:
+#	1. Create a pool with block_cloning enabled and dedup=on
+#	2. Write a file (4 blocks, unique DDT entries, refcnt=1)
+#	3. Clone the file - ddt_addref() bumps DDT refcnt to 2, entries move
+#	   from unique to duplicate table; no BRT entries are created
+#	4. Write a third copy via dd - DDT refcnt becomes 3
+#	5. Delete files in sequence, verifying DDT counts and zdb -b at each step
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+log_assert "Block cloning with live DDT entries uses ddt_addref, not BRT"
+
+# Flush DDT log every TXG so entries appear in the ZAP immediately.
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
+function cleanup
+{
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
+}
+
+log_onexit cleanup
+
+# we disable compression so our writes create predictable results on disk
+# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
+log_must zpool create -f \
+    -o feature@block_cloning=enabled \
+    -O dedup=on \
+    -O compression=off \
+    -O xattr=sa \
+    $TESTPOOL $DISKS
+
+log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS
+typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+
+# Write unique data: 4 blocks, each gets a DDT entry with refcnt=1.
+log_must dd if=/dev/urandom of=$mountpoint/file1 bs=128k count=4
+sync_pool $TESTPOOL
+
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'"
+
+# Clone file1.  The extra reference goes into the DDT rather than the BRT.
+# The entries move from unique (refcnt=1) to duplicate (refcnt=2).
+log_must clonefile -f $mountpoint/file1 $mountpoint/clone1
+sync_pool $TESTPOOL
+
+log_must eval \
+    "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'"
+log_must zdb -b $TESTPOOL
+
+# Write a third copy via dd — DDT refcnt becomes 3.
+log_must dd if=$mountpoint/file1 of=$mountpoint/file2 bs=128k
+sync_pool $TESTPOOL
+
+log_must eval \
+    "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'"
+log_must zdb -b $TESTPOOL
+
+# Delete the clone — DDT refcnt drops to 2, still duplicate.
+log_must rm $mountpoint/clone1
+sync_pool $TESTPOOL
+
+log_must eval \
+    "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'"
+log_must zdb -b $TESTPOOL
+
+# Delete file2 — DDT refcnt drops to 1, entries move back to unique.
+log_must rm $mountpoint/file2
+sync_pool $TESTPOOL
+
+log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'"
+log_must zdb -b $TESTPOOL
+
+# Delete the original — DDT empty, blocks freed.
+log_must rm $mountpoint/file1
+sync_pool $TESTPOOL
+
+log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
+log_must zdb -b $TESTPOOL
+
+log_pass "Block cloning with live DDT entries uses ddt_addref, not BRT"
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
index 6e67a46b040..11e2461d936 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh
@@ -44,14 +44,12 @@ function cleanup
 
 log_onexit cleanup
 
-# create a pool with fast dedup enabled. we disable block cloning to ensure
-# it doesn't get in the way of dedup, and we disable compression so our writes
+# create a pool with fast dedup enabled. we disable compression so our writes
 # create predictable results on disk
 # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
 log_must zpool create -f \
     -o feature@fast_dedup=enabled \
     -O dedup=on \
-    -o feature@block_cloning=disabled \
     -O compression=off \
     -O xattr=sa \
     $TESTPOOL $DISKS
@@ -81,7 +79,7 @@ obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }')
 log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1
 
 # copy the file
-log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2
+log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128k
 log_must zpool sync
 
 # now four entries in the duplicate table
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
index 3a90d656d00..1885daf4489 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh
@@ -44,14 +44,12 @@ function cleanup
 
 log_onexit cleanup
 
-# create a pool with fast dedup enabled. we disable block cloning to ensure
-# it doesn't get in the way of dedup, and we disable compression so our writes
+# create a pool with fast dedup enabled. we disable compression so our writes
 # create predictable results on disk
 # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
 log_must zpool create -f \
     -o feature@fast_dedup=enabled \
     -O dedup=on \
-    -o feature@block_cloning=disabled \
     -O compression=off \
     -O xattr=sa \
     $TESTPOOL $DISKS
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh
index 1fc598c5dd2..2bebed6965f 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh
@@ -46,11 +46,9 @@ function cleanup
 
 log_onexit cleanup
 
-# Create a pool with fast dedup enabled. We disable block cloning to ensure
-# it doesn't get in the way of dedup.
+# Create a pool with fast dedup enabled.
 log_must zpool create -f \
     -o feature@fast_dedup=enabled \
-    -o feature@block_cloning=disabled \
     $TESTPOOL $DISKS
 
 # Create a filesystem with a small recordsize so that we get more DDT entries,
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
index 4422502452b..cc9a8694724 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh
@@ -37,14 +37,12 @@ function cleanup
 
 log_onexit cleanup
 
-# create a pool with legacy dedup enabled. we disable block cloning to ensure
-# it doesn't get in the way of dedup, and we disable compression so our writes
+# create a pool with legacy dedup enabled. we disable compression so our writes
 # create predictable results on disk
 # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
 log_must zpool create -f \
     -o feature@fast_dedup=disabled \
     -O dedup=on \
-    -o feature@block_cloning=disabled \
     -O compression=off \
     -O xattr=sa \
     $TESTPOOL $DISKS
@@ -70,7 +68,7 @@ log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'"
 log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1
 
 # copy the file
-log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2
+log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128k
 log_must zpool sync
 
 # now four entries in the duplicate table
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
index b51eae2ad08..03acaf09b39 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh
@@ -45,13 +45,11 @@ function cleanup
 
 log_onexit cleanup
 
-# create a pool with legacy dedup enabled. we disable block cloning to ensure
-# it doesn't get in the way of dedup, and we disable compression so our writes
+# create a pool with legacy dedup enabled. we disable compression so our writes
 # create predictable results on disk
 # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
 log_must zpool create -f \
     -o feature@fast_dedup=disabled \
-    -o feature@block_cloning=disabled \
     -O compression=off \
     -O xattr=sa \
     $TESTPOOL $DISKS
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
index ece43036c07..2b610af1ebf 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh
@@ -45,14 +45,12 @@ function cleanup
 
 log_onexit cleanup
 
-# create a pool with legacy dedup enabled. we disable block cloning to ensure
-# it doesn't get in the way of dedup, and we disable compression so our writes
+# create a pool with legacy dedup enabled. we disable compression so our writes
 # create predictable results on disk
 # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
 log_must zpool create -f \
     -o feature@fast_dedup=disabled \
     -O dedup=on \
-    -o feature@block_cloning=disabled \
     -O compression=off \
     -O xattr=sa \
     $TESTPOOL $DISKS
@@ -84,7 +82,7 @@ log_must zpool set feature@fast_dedup=enabled $TESTPOOL
 log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled"
 
 # copy the file
-log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2
+log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128k
 log_must zpool sync
 
 # feature should still be enabled
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh
index 550f51cdb82..c137f7b9499 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh
@@ -37,14 +37,12 @@ function cleanup
 
 log_onexit cleanup
 
-# create a pool with legacy dedup enabled. we disable block cloning to ensure
-# it doesn't get in the way of dedup, and we disable compression so our writes
+# create a pool with legacy dedup enabled. we disable compression so our writes
 # create predictable results on disk
 # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting
 log_must zpool create -f \
     -o feature@fast_dedup=disabled \
     -O dedup=on \
-    -o feature@block_cloning=disabled \
     -O compression=off \
     -O xattr=sa \
     $TESTPOOL $DISKS
diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh
index c0a2adb30c2..d80fbe9795d 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh
@@ -69,12 +69,12 @@ function ddt_entries
 
 log_onexit cleanup
 
-log_must zpool create -f -o feature@block_cloning=disabled $TESTPOOL $DISKS
+log_must zpool create -f $TESTPOOL $DISKS
 
 log_must zfs create -o recordsize=512 -o dedup=on $TESTPOOL/$TESTFS
 typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
 log_must dd if=/dev/urandom of=$mountpoint/f1 bs=512k count=1
-log_must cp $mountpoint/f1 $mountpoint/f2
+log_must dd if=$mountpoint/f1 of=$mountpoint/f2 bs=512k
 sync_pool $TESTPOOL
 entries=$(ddt_entries)
 log_note "ddt entries before: $entries"

From 3e5713771d715111084b1a833518828fd97b4c6c Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 13 May 2026 16:34:35 -0700
Subject: [PATCH 027/129] ZTS: zhack_metaslab_leak.ksh busy export

If the pool is active 'zpool export' will fail resulting in
a test failure.  Swap log_must with log_must_busy so the export
is retried when reported as busy before failing the test.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18512
---
 .../tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh
index 0d2a39be6b5..c8a69c09aac 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh
@@ -31,7 +31,7 @@ verify_runnable "global"
 
 function cleanup
 {
-	zpool destroy $TESTPOOL
+	destroy_pool $TESTPOOL
 	rm $tmp
 }
 
@@ -58,7 +58,7 @@ log_must eval "zdb -m --allocated-map $TESTPOOL > $tmp"
 log_must zpool destroy $TESTPOOL
 
 log_must zpool create $TESTPOOL $DISKS
-log_must zpool export $TESTPOOL
+log_must_busy zpool export $TESTPOOL
 log_must eval "zhack metaslab leak $TESTPOOL < $tmp"
 log_must zpool import $TESTPOOL
 

From 47af5e4efcb7caf750aa1705a129297466fe331d Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 13 May 2026 20:21:16 -0700
Subject: [PATCH 028/129] arc: export additional required symbols

External consumers of arc_read() need to be able to destroy the
returned arc_buf_t.  Add the arc_buf_destroy() interface as an
exported symbol.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18533
---
 module/zfs/arc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index b2dba3e0ff9..1f0bd5e4595 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -11686,6 +11686,7 @@ EXPORT_SYMBOL(arc_write);
 EXPORT_SYMBOL(arc_read);
 EXPORT_SYMBOL(arc_buf_info);
 EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_buf_destroy);
 EXPORT_SYMBOL(arc_add_prune_callback);
 EXPORT_SYMBOL(arc_remove_prune_callback);
 

From 8c3b0c7328ad8b64c787fbdf2917e838c10cf33d Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 13 May 2026 20:22:58 -0700
Subject: [PATCH 029/129] Remove arc_bcopy_func() function

While this function could be convenient, it appears it has never been
used.  In practice, callers end up using arc_getbuf_func() instead.
Remove this unused function.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18534
---
 include/sys/arc.h |  3 +--
 module/zfs/arc.c  | 14 --------------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/include/sys/arc.h b/include/sys/arc.h
index 2b3668c6086..d6f025d0942 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -95,8 +95,7 @@ typedef void arc_prune_func_t(uint64_t bytes, void *priv);
 extern uint_t zfs_arc_average_blocksize;
 extern int l2arc_exclude_special;
 
-/* generic arc_done_func_t's which you can use */
-arc_read_done_func_t arc_bcopy_func;
+/* generic arc_read_done_func_t which can be used */
 arc_read_done_func_t arc_getbuf_func;
 
 /* generic arc_prune_func_t wrapper for callbacks */
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 1f0bd5e4595..b67e10b5c14 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -5572,20 +5572,6 @@ arc_buf_access(arc_buf_t *buf)
 	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 }
 
-/* a generic arc_read_done_func_t which you can use */
-void
-arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
-    arc_buf_t *buf, void *arg)
-{
-	(void) zio, (void) zb, (void) bp;
-
-	if (buf == NULL)
-		return;
-
-	memcpy(arg, buf->b_data, arc_buf_size(buf));
-	arc_buf_destroy(buf, arg);
-}
-
 /* a generic arc_read_done_func_t */
 void
 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,

From 8b24164f29da9835256412ec267f07e3d7db9a37 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Wed, 13 May 2026 20:24:17 -0700
Subject: [PATCH 030/129] CI: Fix 99.99 META version

We have an option in zfs-qemu-packages to test against a specific kernel
version.  However, qemu-3-deps.sh was incorrectly hard coded to look
at $2 for a kernel version argument (which could come in $2 or $3
depending on if --poweroff was also passed).  This caused the CI
to incorrectly edit META with a max supported kernel version of 99.99
when we didn't want that.

Fix this by checking whether the last argument looks like a kernel
version and, if so, setting that as the kernel max in META.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18526
Closes #18531
---
 .github/workflows/scripts/qemu-3-deps.sh | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh
index 267ae4ad3c7..6e8dd6d7546 100755
--- a/.github/workflows/scripts/qemu-3-deps.sh
+++ b/.github/workflows/scripts/qemu-3-deps.sh
@@ -2,9 +2,12 @@
 # 3) Wait for VM to boot from previous step and launch dependencies
 #    script on it.
 #
-# $1: OS name (like 'fedora41')
-# $2: (optional) Experimental kernel version to install on fedora,
-#     like "6.14".
+# qemu-3-deps.sh [--poweroff] OS_NAME [FEDORA_VERSION]
+#
+# --poweroff: Power off the VM after installing dependencies
+# OS_NAME: OS name (like 'fedora41')
+# FEDORA_VERSION: (optional) Experimental Fedora kernel version, like "6.14" to
+#     install instead of Fedora defaults.
 ######################################################################
 
 .github/workflows/scripts/qemu-wait-for-vm.sh vm0
@@ -15,8 +18,13 @@
 # we need to update the kernel version in zfs's META file to allow the
 # build to happen.  We update our local copy of META here, since we know
 # it will be rsync'd up in the next step.
-if [ -n "${2:-}" ] ; then
-  sed -i -E 's/Linux-Maximum: .+/Linux-Maximum: 99.99/g' META
+#
+# Look to see if the last argument looks like a kernel version.
+ver="${@: -1}"
+if [[ $ver =~ ^[0-9]+\.[0-9]+ ]] ; then
+  # We got a kernel version, update META to say we support it so we can
+  # test against it.  Quote the expression so $ver is never word-split.
+  sed -i -E "s/Linux-Maximum: .+/Linux-Maximum: $ver/g" META
 fi
 
 scp .github/workflows/scripts/qemu-3-deps-vm.sh zfs@vm0:qemu-3-deps-vm.sh

From be6b6ea8c6ae7e7519a62a2a322f4db499f224d9 Mon Sep 17 00:00:00 2001
From: ZhengYuan Huang <gality369@gmail.com>
Date: Thu, 14 May 2026 12:53:14 +0800
Subject: [PATCH 031/129] linux: suppress reclaim lockdep in zfs_inactive via
 rwlock wrappers

kswapd can enter zfs_inactive() from inode reclaim while holding
fs_reclaim. The z_teardown_inactive_lock still serializes teardown,
but the reclaim-thread acquire/release pair can produce a lockdep
cycle through zfs_zinactive() and zfs_rmnode().

Add Linux rwlock nolockdep wrappers alongside the existing rwlock
macros and use them only for the reclaim-thread
z_teardown_inactive_lock acquire/release in zfs_inactive(). Keep
the real rwsem semantics unchanged and leave CONFIG_LOCKDEP
handling in the platform rwlock layer.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: ZhengYuan Huang <gality369@gmail.com>
Closes #18505
---
 include/os/linux/spl/sys/rwlock.h  | 136 +++++++++++++++++++++--------
 module/os/linux/zfs/zfs_vnops_os.c |  30 +++++--
 2 files changed, 124 insertions(+), 42 deletions(-)

diff --git a/include/os/linux/spl/sys/rwlock.h b/include/os/linux/spl/sys/rwlock.h
index c883836c2f8..64361bea90e 100644
--- a/include/os/linux/spl/sys/rwlock.h
+++ b/include/os/linux/spl/sys/rwlock.h
@@ -30,7 +30,6 @@
 #include <linux/sched.h>
 
 typedef enum {
-	RW_DRIVER	= 2,
 	RW_DEFAULT	= 4,
 	RW_NOLOCKDEP	= 5
 } krw_type_t;
@@ -75,20 +74,35 @@ spl_rw_set_type(krwlock_t *rwp, krw_type_t type)
 {
 	rwp->rw_type = type;
 }
+
+static inline void
+spl_rw_lockdep_off(void)
+{
+	lockdep_off();
+}
+
+static inline void
+spl_rw_lockdep_on(void)
+{
+	lockdep_on();
+}
+
 static inline void
 spl_rw_lockdep_off_maybe(krwlock_t *rwp)		\
 {							\
 	if (rwp && rwp->rw_type == RW_NOLOCKDEP)	\
-		lockdep_off();				\
+		spl_rw_lockdep_off();			\
 }
 static inline void
 spl_rw_lockdep_on_maybe(krwlock_t *rwp)			\
 {							\
 	if (rwp && rwp->rw_type == RW_NOLOCKDEP)	\
-		lockdep_on();				\
+		spl_rw_lockdep_on();			\
 }
 #else  /* CONFIG_LOCKDEP */
 #define	spl_rw_set_type(rwp, type)
+#define	spl_rw_lockdep_off()
+#define	spl_rw_lockdep_on()
 #define	spl_rw_lockdep_off_maybe(rwp)
 #define	spl_rw_lockdep_on_maybe(rwp)
 #endif /* CONFIG_LOCKDEP */
@@ -117,6 +131,56 @@ RW_READ_HELD(krwlock_t *rwp)
  * will be correctly located in the users code which is important
  * for the built in kernel lock analysis tools
  */
+#define	spl_rw_tryenter_impl(rwp, rw) /* CSTYLED */			\
+({									\
+	int _rc_ = 0;							\
+									\
+	switch (rw) {							\
+	case RW_READER:							\
+		_rc_ = down_read_trylock(SEM(rwp));			\
+		break;							\
+	case RW_WRITER:							\
+		if ((_rc_ = down_write_trylock(SEM(rwp))))		\
+			spl_rw_set_owner(rwp);				\
+		break;							\
+	default:							\
+		VERIFY(0);						\
+	}								\
+	_rc_;								\
+})
+
+#define	spl_rw_enter_impl(rwp, rw) /* CSTYLED */			\
+({									\
+	switch (rw) {							\
+	case RW_READER:							\
+		down_read(SEM(rwp));					\
+		break;							\
+	case RW_WRITER:							\
+		down_write(SEM(rwp));					\
+		spl_rw_set_owner(rwp);					\
+		break;							\
+	default:							\
+		VERIFY(0);						\
+	}								\
+})
+
+#define	spl_rw_exit_impl(rwp) /* CSTYLED */				\
+({									\
+	if (RW_WRITE_HELD(rwp)) {					\
+		spl_rw_clear_owner(rwp);				\
+		up_write(SEM(rwp));					\
+	} else {							\
+		ASSERT(RW_READ_HELD(rwp));				\
+		up_read(SEM(rwp));					\
+	}								\
+})
+
+#define	spl_rw_downgrade_impl(rwp) /* CSTYLED */			\
+({									\
+	spl_rw_clear_owner(rwp);					\
+	downgrade_write(SEM(rwp));					\
+})
+
 #define	rw_init(rwp, name, type, arg) /* CSTYLED */			\
 ({									\
 	static struct lock_class_key __key;				\
@@ -140,60 +204,60 @@ RW_READ_HELD(krwlock_t *rwp)
 
 #define	rw_tryenter(rwp, rw) /* CSTYLED */				\
 ({									\
-	int _rc_ = 0;							\
-									\
 	spl_rw_lockdep_off_maybe(rwp);					\
-	switch (rw) {							\
-	case RW_READER:							\
-		_rc_ = down_read_trylock(SEM(rwp));			\
-		break;							\
-	case RW_WRITER:							\
-		if ((_rc_ = down_write_trylock(SEM(rwp))))		\
-			spl_rw_set_owner(rwp);				\
-		break;							\
-	default:							\
-		VERIFY(0);						\
-	}								\
+	int _rc_ = spl_rw_tryenter_impl(rwp, rw);			\
 	spl_rw_lockdep_on_maybe(rwp);					\
 	_rc_;								\
 })
 
+#define	rw_tryenter_nolockdep(rwp, rw) /* CSTYLED */			\
+({									\
+	spl_rw_lockdep_off();						\
+	int _rc_ = spl_rw_tryenter_impl(rwp, rw);			\
+	spl_rw_lockdep_on();						\
+	_rc_;								\
+})
+
 #define	rw_enter(rwp, rw) /* CSTYLED */					\
 ({									\
 	spl_rw_lockdep_off_maybe(rwp);					\
-	switch (rw) {							\
-	case RW_READER:							\
-		down_read(SEM(rwp));					\
-		break;							\
-	case RW_WRITER:							\
-		down_write(SEM(rwp));					\
-		spl_rw_set_owner(rwp);					\
-		break;							\
-	default:							\
-		VERIFY(0);						\
-	}								\
+	spl_rw_enter_impl(rwp, rw);					\
 	spl_rw_lockdep_on_maybe(rwp);					\
 })
 
+#define	rw_enter_nolockdep(rwp, rw) /* CSTYLED */			\
+({									\
+	spl_rw_lockdep_off();						\
+	spl_rw_enter_impl(rwp, rw);					\
+	spl_rw_lockdep_on();						\
+})
+
 #define	rw_exit(rwp) /* CSTYLED */					\
 ({									\
 	spl_rw_lockdep_off_maybe(rwp);					\
-	if (RW_WRITE_HELD(rwp)) {					\
-		spl_rw_clear_owner(rwp);				\
-		up_write(SEM(rwp));					\
-	} else {							\
-		ASSERT(RW_READ_HELD(rwp));				\
-		up_read(SEM(rwp));					\
-	}								\
+	spl_rw_exit_impl(rwp);						\
 	spl_rw_lockdep_on_maybe(rwp);					\
 })
 
+#define	rw_exit_nolockdep(rwp) /* CSTYLED */				\
+({									\
+	spl_rw_lockdep_off();						\
+	spl_rw_exit_impl(rwp);						\
+	spl_rw_lockdep_on();						\
+})
+
 #define	rw_downgrade(rwp) /* CSTYLED */					\
 ({									\
 	spl_rw_lockdep_off_maybe(rwp);					\
-	spl_rw_clear_owner(rwp);					\
-	downgrade_write(SEM(rwp));					\
+	spl_rw_downgrade_impl(rwp);					\
 	spl_rw_lockdep_on_maybe(rwp);					\
 })
 
+#define	rw_downgrade_nolockdep(rwp) /* CSTYLED */			\
+({									\
+	spl_rw_lockdep_off();						\
+	spl_rw_downgrade_impl(rwp);					\
+	spl_rw_lockdep_on();						\
+})
+
 #endif /* _SPL_RWLOCK_H */
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 1e1e663b1f7..d6dad70ae09 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -4078,18 +4078,32 @@ zfs_inactive(struct inode *ip)
 {
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	krwlock_t *zti_lock = &zfsvfs->z_teardown_inactive_lock;
 	uint64_t atime[2];
 	int error;
 	int need_unlock = 0;
+	boolean_t no_lockdep = B_FALSE;
 
 	/* Only read lock if we haven't already write locked, e.g. rollback */
-	if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
+	if (!RW_WRITE_HELD(zti_lock)) {
 		need_unlock = 1;
-		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+		/*
+		 * kswapd reaches evict_inode() with fs_reclaim held.  Suppress
+		 * lockdep only for this reclaim-thread acquire/release pair.
+		 */
+		no_lockdep = current_is_reclaim_thread();
+		if (no_lockdep)
+			rw_enter_nolockdep(zti_lock, RW_READER);
+		else
+			rw_enter(zti_lock, RW_READER);
 	}
 	if (zp->z_sa_hdl == NULL) {
-		if (need_unlock)
-			rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		if (need_unlock) {
+			if (no_lockdep)
+				rw_exit_nolockdep(zti_lock);
+			else
+				rw_exit(zti_lock);
+		}
 		return;
 	}
 
@@ -4115,8 +4129,12 @@ zfs_inactive(struct inode *ip)
 	}
 
 	zfs_zinactive(zp);
-	if (need_unlock)
-		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	if (need_unlock) {
+		if (no_lockdep)
+			rw_exit_nolockdep(zti_lock);
+		else
+			rw_exit(zti_lock);
+	}
 }
 
 /*

From fed1b58a6da747f6d7941afaab19b7cdcfeabbcd Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Thu, 14 May 2026 12:26:35 +1000
Subject: [PATCH 032/129] zap: fix refcount tag use in zap_lookup_length_uint64
 and zap_prefetch_uint64

The same tag must be used for zap_lockdir() and zap_unlockdir(), so we have
to follow the pattern used elsewhere: pass the tag used for
zap_lockdir() through to the _impl(), so it can use it for
zap_unlockdir().

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18536
---
 module/zfs/zap.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 4c4aec07c91..609e2f1128f 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -320,18 +320,18 @@ zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
 static int
 zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
-    uint64_t *actual_num_integers)
+    uint64_t *actual_num_integers, const void *tag)
 {
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	int err = fzap_lookup(zn, integer_size, num_integers, buf,
 	    NULL, 0, NULL, actual_num_integers);
 	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
+	zap_unlockdir(zap, tag);
 	return (err);
 }
 
@@ -346,7 +346,7 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	if (err != 0)
 		return (err);
 	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, NULL);
+	    integer_size, num_integers, buf, NULL, FTAG);
 	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
@@ -362,7 +362,7 @@ zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 	if (err != 0)
 		return (err);
 	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, NULL);
+	    integer_size, num_integers, buf, NULL, FTAG);
 	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
@@ -379,7 +379,7 @@ zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 	if (err != 0)
 		return (err);
 	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, actual_num_integers);
+	    integer_size, num_integers, buf, actual_num_integers, FTAG);
 	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
@@ -423,17 +423,18 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
 /* zap_prefetch_uint64 */
 
 static int
-zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints)
+zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
+    const void *tag)
 {
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	fzap_prefetch(zn);
 	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
+	zap_unlockdir(zap, tag);
 	return (0);
 }
 
@@ -447,7 +448,7 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
-	err = zap_prefetch_uint64_impl(zap, key, key_numints);
+	err = zap_prefetch_uint64_impl(zap, key, key_numints, FTAG);
 	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
@@ -461,7 +462,7 @@ zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
 	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
-	err = zap_prefetch_uint64_impl(zap, key, key_numints);
+	err = zap_prefetch_uint64_impl(zap, key, key_numints, FTAG);
 	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }

From edb9af386ec13d969ab07369f4648c3c4f3c1131 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 13 May 2026 16:17:56 +1000
Subject: [PATCH 033/129] ddt_log: fix refcount tag between ddt_log_begin &
 ddt_log_commit

We have to hold and release the dbuf array with the same tag. Since the
caller provides the ddt_log_update_t and is managing its lifetime, and
the begin/commit calls must be matched, it's quite reasonable to its
pointer as the refcount tag.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18536
---
 module/zfs/ddt_log.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c
index 51ce8b9a084..7e699a9b425 100644
--- a/module/zfs/ddt_log.c
+++ b/module/zfs/ddt_log.c
@@ -221,7 +221,7 @@ ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
 	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
 
 	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
-	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
+	    B_FALSE, dlu, &dlu->dlu_ndbp, &dlu->dlu_dbp,
 	    DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO));
 
 	dlu->dlu_tx = tx;
@@ -338,7 +338,7 @@ ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
 	 */
 	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
 
-	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
+	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, dlu);
 
 	ddt->ddt_log_active->ddl_length +=
 	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;

From 7012b46b7bfa4d7cb4681e3195162eb3b3c1a3dc Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 13 May 2026 20:22:37 +1000
Subject: [PATCH 034/129] dsl_bookmark: fix redaction list refcount tag when
 upgrading spill

rl_bonus and rl_dbuf are expected to have the same hold tag when they
are different buffers. If the spill hold is taken after the
redaction_list_t was created and the bonus hold was taken, it must also
be taken with the same tag. Fortunately, we have it right here, so we
can just use it.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18536
---
 module/zfs/dsl_bookmark.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c
index 4ffd75ceace..b0354203d42 100644
--- a/module/zfs/dsl_bookmark.c
+++ b/module/zfs/dsl_bookmark.c
@@ -490,7 +490,7 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
 		} else {
 			dmu_buf_t *db;
 			VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
-			    DB_RF_MUST_SUCCEED, FTAG, &db));
+			    DB_RF_MUST_SUCCEED, tag, &db));
 			dmu_buf_will_fill(db, tx, B_FALSE);
 			VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen,
 			    SPA_MINBLOCKSIZE), tx));

From 3800525cd2cceeb0a42cc14d43873a8615e397ec Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 15 May 2026 09:14:36 -0700
Subject: [PATCH 035/129] Fix aarch64 build failure by removing earlyclobber
 (#18532)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The UVR macros used "+&w" (read-write + earlyclobber) as the
constraint for NEON register operands that are declared as explicit
hard-register variables via:

register unsigned char wN asm("vN") __attribute__((vector_size(16)));

The + modifier implicitly makes the operand also an input (reading the
register before the asm runs). The & (earlyclobber) modifier says "this
output may be written before all inputs are consumed." Having an
earlyclobber output on the same hard-register that is simultaneously
an input is a contradiction — GCC 16 now strictly diagnoses this.

The fix removes the & from "+&w", yielding "+w". The earlyclobber
was both incorrect (contradicts the implicit input) and unnecessary
(the physical registers are already hard-bound, so the compiler has no
freedom to assign conflicting registers anyway).


Issue #18525

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
---
 .../zfs/vdev_raidz_math_aarch64_neon_common.h  | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/module/zfs/vdev_raidz_math_aarch64_neon_common.h
index 1ec4d0218bb..3c3370290c8 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neon_common.h
+++ b/module/zfs/vdev_raidz_math_aarch64_neon_common.h
@@ -102,14 +102,14 @@
 
 #define	WVR(X) [w##X] "=w" (w##X)
 
-#define	UVR0_(REG, ...) [w##REG] "+&w" (w##REG)
-#define	UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG)
-#define	UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG)
-#define	UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG)
-#define	UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG)
-#define	UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG)
-#define	UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG)
-#define	UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG)
+#define	UVR0_(REG, ...) [w##REG] "+w" (w##REG)
+#define	UVR1_(_1, REG, ...) [w##REG] "+w" (w##REG)
+#define	UVR2_(_1, _2, REG, ...) [w##REG] "+w" (w##REG)
+#define	UVR3_(_1, _2, _3, REG, ...) [w##REG] "+w" (w##REG)
+#define	UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+w" (w##REG)
+#define	UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+w" (w##REG)
+#define	UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+w" (w##REG)
+#define	UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+w" (w##REG)
 
 #define	UVR0(r...) UVR0_(r)
 #define	UVR1(r...) UVR1_(r)
@@ -120,7 +120,7 @@
 #define	UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
 #define	UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)
 
-#define	UVR(X) [w##X] "+&w" (w##X)
+#define	UVR(X) [w##X] "+w" (w##X)
 
 #define	R_01(REG1, REG2, ...) REG1, REG2
 #define	_R_23(_0, _1, REG2, REG3, ...) REG2, REG3

From 9e9a012a0300b1bc17caddb1adbea9a0628c3414 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 15 May 2026 10:12:39 -0700
Subject: [PATCH 036/129] CI: Remove deprecated Fedora 42

Fedora 42 was deprecated on May 13 2026.  Remove it from CI tests.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18545
---
 .github/workflows/scripts/qemu-2-start.sh    |  5 -----
 .github/workflows/scripts/qemu-4-build-vm.sh |  2 +-
 .github/workflows/zfs-qemu-packages.yml      |  2 +-
 .github/workflows/zfs-qemu.yml               | 10 +++++-----
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh
index 3c1f456ed0c..7f27eeffed6 100755
--- a/.github/workflows/scripts/qemu-2-start.sh
+++ b/.github/workflows/scripts/qemu-2-start.sh
@@ -78,11 +78,6 @@ case "$OS" in
     OPTS[0]="--boot"
     OPTS[1]="uefi=on"
     ;;
-  fedora42)
-    OSNAME="Fedora 42"
-    OSv="fedora-unknown"
-    URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2"
-    ;;
   fedora43)
     OSNAME="Fedora 43"
     OSv="fedora-unknown"
diff --git a/.github/workflows/scripts/qemu-4-build-vm.sh b/.github/workflows/scripts/qemu-4-build-vm.sh
index bbfa2ec85b8..bd77f2c5ca1 100755
--- a/.github/workflows/scripts/qemu-4-build-vm.sh
+++ b/.github/workflows/scripts/qemu-4-build-vm.sh
@@ -337,7 +337,7 @@ fi
 #
 # rhel8.10
 # almalinux9.5
-# fedora42
+# fedora44
 source /etc/os-release
  if which hostnamectl &> /dev/null ; then
   # Fedora 42+ use hostnamectl
diff --git a/.github/workflows/zfs-qemu-packages.yml b/.github/workflows/zfs-qemu-packages.yml
index c3a7397c6ae..25afb77233c 100644
--- a/.github/workflows/zfs-qemu-packages.yml
+++ b/.github/workflows/zfs-qemu-packages.yml
@@ -58,7 +58,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: ['almalinux8', 'almalinux9', 'almalinux10', 'fedora42', 'fedora43', 'fedora44']
+        os: ['almalinux8', 'almalinux9', 'almalinux10', 'fedora43', 'fedora44']
     runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v6
diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index f07988f701d..9a594db6854 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -14,7 +14,7 @@ on:
         type: string
         required: false
         default: ""
-        description: "(optional) Only run on this specific OS (like 'fedora42' or 'alpine3-23')"
+        description: "(optional) Only run on this specific OS (like 'fedora44' or 'alpine3-23')"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -49,23 +49,23 @@ jobs:
             os_selection='[]'
             ;;
           quick)
-            os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd15-1s", "ubuntu24"]'
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora44", "freebsd15-1s", "ubuntu24"]'
             ;;
           linux)
-            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora42", "fedora43", "fedora44", "ubuntu22", "ubuntu24"]'
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora43", "fedora44", "ubuntu22", "ubuntu24"]'
             ;;
           freebsd)
             os_selection='["freebsd13-5r", "freebsd14-4r", "freebsd13-5s", "freebsd14-4s", "freebsd15-1s", "freebsd16-0c"]'
             ;;
           *)
             # default list
-            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora42", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24"]'
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24"]'
             ;;
           esac
 
           # Repository-level override for OS selection.
           # Set vars.ZTS_OS_OVERRIDE in repo settings to restrict targets
-          # (e.g. '["debian13"]' or '["debian13", "fedora42"]').
+          # (e.g. '["debian13"]' or '["debian13", "fedora44"]').
           # Manual ZFS-CI-Type in commit messages bypasses the override.
           if [ -n "${{ vars.ZTS_OS_OVERRIDE }}" ] && [ "$ci_source" != "manual" ]; then
             override='${{ vars.ZTS_OS_OVERRIDE }}'

From 839ec56120cfae79170e863632d114d78c6311e5 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 14 May 2026 10:07:12 +1000
Subject: [PATCH 037/129] zstream: dump backtrace on crash

Same method as zdb and ztest. zstream doesn't get touched much, and
plays a bit fast-and-loose with some core code. It's not hard for a
change to make it crash; this makes debugging easier when it does.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18535
---
 cmd/zstream/zstream.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/cmd/zstream/zstream.c b/cmd/zstream/zstream.c
index f1a2fa75740..da74ab6e1e5 100644
--- a/cmd/zstream/zstream.c
+++ b/cmd/zstream/zstream.c
@@ -29,6 +29,8 @@
 #include <libintl.h>
 #include <stddef.h>
 #include <libzfs.h>
+#include <signal.h>
+#include <sys/backtrace.h>
 #include "zstream.h"
 
 void
@@ -53,9 +55,43 @@ zstream_usage(void)
 	exit(1);
 }
 
+static void sig_handler(int signo)
+{
+	struct sigaction action;
+	libspl_backtrace(STDERR_FILENO);
+
+	/*
+	 * Restore default action and re-raise signal so SIGSEGV and
+	 * SIGABRT can trigger a core dump.
+	 */
+	action.sa_handler = SIG_DFL;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+	(void) sigaction(signo, &action, NULL);
+	raise(signo);
+}
+
+
 int
 main(int argc, char *argv[])
 {
+	/*
+	 * Set up signal handlers, so if we crash due to bad data in the stream
+	 * we can get more info. Unlike ztest, we don't bail out if we can't
+	 * set up signal handlers, because zstream is very useful without them.
+	 */
+	struct sigaction action = { .sa_handler = sig_handler };
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+	if (sigaction(SIGSEGV, &action, NULL) < 0) {
+		(void) fprintf(stderr, "zstream: cannot catch SIGSEGV: %s\n",
+		    strerror(errno));
+	}
+	if (sigaction(SIGABRT, &action, NULL) < 0) {
+		(void) fprintf(stderr, "zstream: cannot catch SIGABRT: %s\n",
+		    strerror(errno));
+	}
+
 	char *basename = strrchr(argv[0], '/');
 	basename = basename ? (basename + 1) : argv[0];
 	if (argc >= 1 && strcmp(basename, "zstreamdump") == 0)

From 2fa83c008edea2294309d5caf50ca98f1b0a163b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 14 May 2026 10:22:05 +1000
Subject: [PATCH 038/129] zstream: init/fini refcount tracking

When compiled with ZFS_DEBUG and reference_tracking_enable is enabled,
ABD alloc/free will have real refcount tracking, which will crash if the
reference cache hasn't been initialised. Adding it to the init & fini
lists is the quickest way to get that going again.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18535
---
 cmd/zstream/zstream_recompress.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c
index 5092b534a8f..f5abfa98b18 100644
--- a/cmd/zstream/zstream_recompress.c
+++ b/cmd/zstream/zstream_recompress.c
@@ -99,6 +99,7 @@ zstream_do_recompress(int argc, char *argv[])
 		exit(1);
 	}
 
+	zfs_refcount_init();
 	abd_init();
 	fletcher_4_init();
 	zio_init();
@@ -353,6 +354,7 @@ zstream_do_recompress(int argc, char *argv[])
 	zio_fini();
 	zstd_fini();
 	abd_fini();
+	zfs_refcount_fini();
 
 	return (0);
 }

From f4a8b0f731504faf895879fb06611109c49b3e6d Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Fri, 15 May 2026 10:35:48 -0700
Subject: [PATCH 039/129] CI: Allow testing with a newer GCC on ARM builder

Add a text box to specify a custom GCC version (like '16') when
running the zfs-arm builder.  This allows you to test with a newer
GCC than the Ubuntu default.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18540
---
 .github/workflows/zfs-arm.yml | 38 ++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/zfs-arm.yml b/.github/workflows/zfs-arm.yml
index 6039e4736c4..b6d6444c2dd 100644
--- a/.github/workflows/zfs-arm.yml
+++ b/.github/workflows/zfs-arm.yml
@@ -4,6 +4,12 @@ on:
   push:
   pull_request:
   workflow_dispatch:
+    inputs:
+      gcc_ver:
+        type: string
+        required: false
+        default: ""
+        description: "(optional) install specific GCC version, like '16'"
 
 jobs:
   zfs-arm:
@@ -18,6 +24,31 @@ jobs:
       timeout-minutes: 20
       run: |
         sudo apt-get -y remove firefox || true
+
+        # Do we want to test with a custom GCC version?
+        if [ "${{ github.event.inputs.gcc_ver }}" != "" ] ; then
+          ver="${{ github.event.inputs.gcc_ver }}"
+
+          sudo add-apt-repository ppa:ubuntu-toolchain-r/test
+          sudo apt-get update
+
+          echo "GCCs available:"
+          awk '/Package: gcc-/{print $2}'  /var/lib/apt/lists/*ubuntu-toolchain-r*Packages
+
+          sudo apt-get -y install gcc g++ gcc-$ver g++-$ver
+
+          sudo update-alternatives --remove-all gcc || true 2>&1
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-$ver 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-$ver 100
+          sudo update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100
+          sudo update-alternatives --set cc /usr/bin/gcc
+          sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100
+          sudo update-alternatives --set c++ /usr/bin/g++
+
+          sudo update-alternatives --set gcc "/usr/bin/gcc-$ver"
+          sudo update-alternatives --set g++ "/usr/bin/g++-$ver"
+        fi
+
         .github/workflows/scripts/qemu-3-deps-vm.sh ubuntu24
 
         # We're running the VM scripts locally on the runner, so need to fix
@@ -28,7 +59,12 @@ jobs:
     - name: Build modules
       timeout-minutes: 30
       run: |
-        .github/workflows/scripts/qemu-4-build-vm.sh --enable-debug ubuntu24
+        # Even though we may have installed a newer GCC, the kernel builds don't
+        # seem to honor it, and instead use the older GCC.  I assume this is
+        # to match up with whatever GCC version was used for the kernel.  Always
+        # specify KERNEL_CC to get around this.  This works when using the
+        # default GCC and with a custom GCC.
+        KERNEL_CC=/usr/bin/gcc .github/workflows/scripts/qemu-4-build-vm.sh --enable-debug ubuntu24
 
         # Quick sanity test since we're not running the full ZTS
         sudo modprobe zfs

From e4b0d59da997005928e67ae5d0be7b6d03ab4659 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 14 May 2026 17:54:10 +1000
Subject: [PATCH 040/129] zap: rename 'lockdir' to 'lock'

The "dir" part is a holdover from prehistoric times, where ZAPs were
just the filesystem directory object.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18546
---
 include/sys/zap_impl.h |   8 +--
 module/zfs/zap.c       | 138 ++++++++++++++++++++---------------------
 module/zfs/zap_fat.c   |  12 ++--
 module/zfs/zap_impl.c  |  12 ++--
 module/zfs/zap_micro.c |   9 ++-
 5 files changed, 89 insertions(+), 90 deletions(-)

diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index 15cc96df3d1..85f9e5d8979 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -248,19 +248,19 @@ uint64_t zap_hash(zap_name_t *zn);
  * Return a zap_t for the given on-disk object, locked and ready for use.
  * The zap_t will be allocated and loaded from disk if its not already loaded.
  */
-int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+int zap_lock(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp);
-int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+int zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp);
 
 /* Underlying implementation for above; do not use. */
-int zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
+int zap_lock_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
 
 /* Unlock and release a zap_t. */
-void zap_unlockdir(zap_t *zap, const void *tag);
+void zap_unlock(zap_t *zap, const void *tag);
 
 /* zap_t release function for when associated dbuf is evicted. */
 void zap_evict_sync(void *dbu);
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 609e2f1128f..f319469e55e 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -288,12 +288,12 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_lookup_impl(zap, name, integer_size,
 	    num_integers, buf, mt, realname, rn_len, ncp);
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -305,13 +305,13 @@ zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
 {
 	zap_t *zap;
 
-	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
 	    FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_lookup_impl(zap, name, integer_size,
 	    num_integers, buf, mt, realname, rn_len, ncp);
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -324,14 +324,14 @@ zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
 {
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	int err = fzap_lookup(zn, integer_size, num_integers, buf,
 	    NULL, 0, NULL, actual_num_integers);
 	zap_name_free(zn);
-	zap_unlockdir(zap, tag);
+	zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -342,12 +342,12 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, buf, NULL, FTAG);
-	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
+	/* zap_lookup_length_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -358,12 +358,12 @@ zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, buf, NULL, FTAG);
-	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
+	/* zap_lookup_length_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -375,12 +375,12 @@ zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, buf, actual_num_integers, FTAG);
-	/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
+	/* zap_lookup_length_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -405,18 +405,18 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	err = zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	fzap_prefetch(zn);
 	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -428,13 +428,13 @@ zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
 {
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	fzap_prefetch(zn);
 	zap_name_free(zn);
-	zap_unlockdir(zap, tag);
+	zap_unlock(zap, tag);
 	return (0);
 }
 
@@ -445,11 +445,11 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_prefetch_uint64_impl(zap, key, key_numints, FTAG);
-	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
+	/* zap_prefetch_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -459,11 +459,11 @@ zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_prefetch_uint64_impl(zap, key, key_numints, FTAG);
-	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
+	/* zap_prefetch_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -496,7 +496,7 @@ zap_add_impl(zap_t *zap, const char *key,
 
 	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
 	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
@@ -522,7 +522,7 @@ zap_add_impl(zap_t *zap, const char *key,
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -534,11 +534,11 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key,
 	zap_t *zap;
 	int err;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	err = zap_lock(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_impl() calls zap_unlockdir() */
+	/* zap_add_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -550,11 +550,11 @@ zap_add_by_dnode(dnode_t *dn, const char *key,
 	zap_t *zap;
 	int err;
 
-	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	err = zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_impl() calls zap_unlockdir() */
+	/* zap_add_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -569,14 +569,14 @@ zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
 
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
 	zap = zn->zn_zap;	/* fzap_add() may change zap */
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -588,12 +588,12 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	    zap_lock(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_add_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_uint64_impl() calls zap_unlockdir() */
+	/* zap_add_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -605,12 +605,12 @@ zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_add_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_uint64_impl() calls zap_unlockdir() */
+	/* zap_add_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -624,12 +624,12 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	const uint64_t *intval = val;
 
 	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	    zap_lock(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
@@ -659,7 +659,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap, FTAG);
+		zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -674,14 +674,14 @@ zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
 
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
 	zap = zn->zn_zap;	/* fzap_update() may change zap */
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -693,12 +693,12 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	    zap_lock(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_update_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_update_uint64_impl() calls zap_unlockdir() */
+	/* zap_update_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -709,12 +709,12 @@ zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_update_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_update_uint64_impl() calls zap_unlockdir() */
+	/* zap_update_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -727,12 +727,12 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
@@ -750,7 +750,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
 		}
 	}
 	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -763,17 +763,17 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_length(zn, integer_size, num_integers);
 	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -783,18 +783,18 @@ zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 {
 	zap_t *zap;
 
-	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
 	    FTAG, &zap);
 	if (err != 0)
 		return (err);
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_length(zn, integer_size, num_integers);
 	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -838,11 +838,11 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
 	zap_t *zap;
 	int err;
 
-	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	err = zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	err = zap_remove_impl(zap, name, 0, tx);
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -853,11 +853,11 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
 	zap_t *zap;
 	int err;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	err = zap_lock(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	err = zap_remove_impl(zap, name, mt, tx);
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -871,12 +871,12 @@ zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
 
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, tag);
+		zap_unlock(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_remove(zn, tx);
 	zap_name_free(zn);
-	zap_unlockdir(zap, tag);
+	zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -887,11 +887,11 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
-	/* zap_remove_uint64_impl() calls zap_unlockdir() */
+	/* zap_remove_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -902,11 +902,11 @@ zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
-	/* zap_remove_uint64_impl() calls zap_unlockdir() */
+	/* zap_remove_uint64_impl() calls zap_unlock() */
 	return (err);
 }
 
@@ -918,7 +918,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	if (!zap->zap_ismicro) {
@@ -926,7 +926,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 	} else {
 		*count = zap->zap_m.zap_num_entries;
 	}
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -935,7 +935,7 @@ zap_count_by_dnode(dnode_t *dn, uint64_t *count)
 {
 	zap_t *zap;
 
-	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
 	    FTAG, &zap);
 	if (err != 0)
 		return (err);
@@ -944,7 +944,7 @@ zap_count_by_dnode(dnode_t *dn, uint64_t *count)
 	} else {
 		*count = zap->zap_m.zap_num_entries;
 	}
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -1189,7 +1189,7 @@ zap_cursor_fini(zap_cursor_t *zc)
 {
 	if (zc->zc_zap) {
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
-		zap_unlockdir(zc->zc_zap, NULL);
+		zap_unlock(zc->zc_zap, NULL);
 		zc->zc_zap = NULL;
 	}
 	if (zc->zc_leaf) {
@@ -1210,7 +1210,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 
 	if (zc->zc_zap == NULL) {
 		int hb;
-		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+		err = zap_lock(zc->zc_objset, zc->zc_zapobj, NULL,
 		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
 		if (err != 0)
 			return (err);
@@ -1305,7 +1305,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 	zap_t *zap;
 
 	int err =
-	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 
@@ -1318,7 +1318,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 	} else {
 		fzap_get_stats(zap, zs);
 	}
-	zap_unlockdir(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (0);
 }
 
diff --git a/module/zfs/zap_fat.c b/module/zfs/zap_fat.c
index 6e2f076cfc3..2d068f64671 100644
--- a/module/zfs/zap_fat.c
+++ b/module/zfs/zap_fat.c
@@ -730,8 +730,8 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
 
 		zap_put_leaf(l);
 		*lp = l = NULL;
-		zap_unlockdir(zap, tag);
-		err = zap_lockdir(os, object, tx, RW_WRITER,
+		zap_unlock(zap, tag);
+		err = zap_lock(os, object, tx, RW_WRITER,
 		    FALSE, FALSE, tag, &zn->zn_zap);
 		zap = zn->zn_zap;
 		if (err != 0)
@@ -817,8 +817,8 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
 			objset_t *os = zap->zap_objset;
 			uint64_t zapobj = zap->zap_object;
 
-			zap_unlockdir(zap, tag);
-			int err = zap_lockdir(os, zapobj, tx,
+			zap_unlock(zap, tag);
+			int err = zap_lock(os, zapobj, tx,
 			    RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
 			zap = zn->zn_zap;
 			if (err != 0)
@@ -1401,8 +1401,8 @@ zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 
 			/*
 			 * Usually, the right way to upgrade from a READER lock
-			 * to a WRITER lock is to call zap_unlockdir() and
-			 * zap_lockdir(), but we do not have a tag. Instead,
+			 * to a WRITER lock is to call zap_unlock() and
+			 * zap_lock(), but we do not have a tag. Instead,
 			 * we do it in more sophisticated way.
 			 */
 			rw_exit(&zap->zap_rwlock);
diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c
index c70fce67875..f50b77e591a 100644
--- a/module/zfs/zap_impl.c
+++ b/module/zfs/zap_impl.c
@@ -290,7 +290,7 @@ zap_hash(zap_name_t *zn)
  * have the specified tag.
  */
 int
-zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
+zap_lock_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
 	ASSERT0(db->db_offset);
@@ -389,7 +389,7 @@ zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
 }
 
 int
-zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp)
 {
@@ -399,7 +399,7 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
 	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
 	if (err != 0)
 		return (err);
-	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+	err = zap_lock_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
 	if (err != 0)
 		dmu_buf_rele(db, tag);
 	else
@@ -408,7 +408,7 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
 }
 
 int
-zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+zap_lock(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp)
 {
@@ -424,7 +424,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
 		dnode_rele(dn, tag);
 		return (err);
 	}
-	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+	err = zap_lock_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
 	if (err != 0) {
 		dmu_buf_rele(db, tag);
 		dnode_rele(dn, tag);
@@ -433,7 +433,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
 }
 
 void
-zap_unlockdir(zap_t *zap, const void *tag)
+zap_unlock(zap_t *zap, const void *tag)
 {
 	rw_exit(&zap->zap_rwlock);
 	dnode_rele(zap->zap_dnode, tag);
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index b094b113971..1f98723f388 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -255,9 +255,8 @@ mzap_open(dmu_buf_t *db)
 	}
 
 	/*
-	 * Make sure that zap_ismicro is set before we let others see
-	 * it, because zap_lockdir() checks zap_ismicro without the lock
-	 * held.
+	 * Make sure that zap_ismicro is set before we let others see it,
+	 * because zap_lock() checks zap_ismicro without the lock held.
 	 */
 	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
 	winner = dmu_buf_set_user(db, &zap->zap_dbu);
@@ -407,10 +406,10 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
 		zap_t *zap;
 		/* Only fat zap supports flags; upgrade immediately. */
 		VERIFY(dnode_add_ref(dn, FTAG));
-		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
+		VERIFY0(zap_lock_impl(dn, db, FTAG, tx, RW_WRITER,
 		    B_FALSE, B_FALSE, &zap));
 		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
-		zap_unlockdir(zap, FTAG);
+		zap_unlock(zap, FTAG);
 	} else {
 		dmu_buf_rele(db, FTAG);
 	}

From d3523f9093984f9678f86c5974ff0645d5c72494 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 7 May 2026 13:42:28 +1000
Subject: [PATCH 041/129] zap_lock: make it be a simple wrapper around
 zap_lock_by_dnode()

The only real difference between zap_lock() and zap_lock_by_dnode() is
that the former takes and releases its own dnode hold. If we make it
just delegate to zap_lock_by_dnode(), then the dbuf hold and release can
be handled there, in one place.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18546
---
 module/zfs/zap_impl.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c
index f50b77e591a..f70e858ddf4 100644
--- a/module/zfs/zap_impl.c
+++ b/module/zfs/zap_impl.c
@@ -413,22 +413,13 @@ zap_lock(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     zap_t **zapp)
 {
 	dnode_t *dn;
-	dmu_buf_t *db;
 	int err;
 
 	err = dnode_hold(os, obj, tag, &dn);
 	if (err != 0)
 		return (err);
-	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
-	if (err != 0) {
-		dnode_rele(dn, tag);
-		return (err);
-	}
-	err = zap_lock_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
-	if (err != 0) {
-		dmu_buf_rele(db, tag);
-		dnode_rele(dn, tag);
-	}
+	err = zap_lock_by_dnode(dn, tx, lti, fatreader, adding, tag, zapp);
+	dnode_rele(dn, tag);
 	return (err);
 }
 

From 18d910bd2ce0434b3374e25d590dcd4d40687971 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 13 May 2026 15:38:27 +1000
Subject: [PATCH 042/129] mzap_create_impl: use zap_lock_by_dnode()

The only reason this used zap_lock_impl() directly was to avoid an extra
dbuf hold, but there's no real reason to do that. Just use
zap_lock_by_dnode(), and then zap_lock_impl() can be de-exported.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18546
---
 include/sys/zap_impl.h | 4 ----
 module/zfs/zap_impl.c  | 2 +-
 module/zfs/zap_micro.c | 9 ++++-----
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index 85f9e5d8979..d985a5a0294 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -255,10 +255,6 @@ int zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp);
 
-/* Underlying implementation for above; do not use. */
-int zap_lock_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
-    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
-
 /* Unlock and release a zap_t. */
 void zap_unlock(zap_t *zap, const void *tag);
 
diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c
index f70e858ddf4..c05985c0adb 100644
--- a/module/zfs/zap_impl.c
+++ b/module/zfs/zap_impl.c
@@ -289,7 +289,7 @@ zap_hash(zap_name_t *zn)
  * This routine "consumes" the caller's hold on the dbuf, which must
  * have the specified tag.
  */
-int
+static int
 zap_lock_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 1f98723f388..04956b005c8 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -405,14 +405,13 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
 	if (flags != 0) {
 		zap_t *zap;
 		/* Only fat zap supports flags; upgrade immediately. */
-		VERIFY(dnode_add_ref(dn, FTAG));
-		VERIFY0(zap_lock_impl(dn, db, FTAG, tx, RW_WRITER,
-		    B_FALSE, B_FALSE, &zap));
+		VERIFY0(zap_lock_by_dnode(dn, tx,
+		    RW_WRITER, B_FALSE, B_FALSE, FTAG, &zap));
 		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
 		zap_unlock(zap, FTAG);
-	} else {
-		dmu_buf_rele(db, FTAG);
 	}
+
+	dmu_buf_rele(db, FTAG);
 }
 
 /*

From c8f9b4c4da44e7ef19be91510f36b8f7acdab4aa Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 14 May 2026 21:36:07 +1000
Subject: [PATCH 043/129] zap: lift and simplify zap_t lock upgrade

Most fatzap write ops only take the READER zap_t lock, because the
header block only needs to be updated when a change would add or remove
a leaf block or spill the ptrtbl. When this happens, the lock is
upgraded to WRITER so those changes can be made.

If the lock can't be upgraded directly (not least because
rw_tryupgrade() is a no-op on Linux and userspace), then it has to be
dropped and re-acquired, that is, zap_unlock() and then zap_lock().

However, this method is far heavier than it needs to be, and adds
complication because it fully releases the zap_t, the header dbuf and
the dnode. This gives a window where the dbuf can be evicted and so the
zap_t destroyed. In addition to the IO overhead if this happens, this
means the zap_t returned by zap_lock() may be different to the original,
which means all callers need to be prepared for it to change.

zap_shrink() used an alternate method of simply dropping and reacquiring
zap_rwlock rather than fully destroying everything. The comment there
says it was only done because of lack of a refcount tag for unlock/lock,
but this is actually a better general technique, as the zap_t is
guaranteed to remain alive because its owning dbuf is never released and
so cannot be evicted.

So, this commit lifts the old zap_tryupgradedir() to
zap_lock_try_upgrade(), and adds a potentially-blocking variant
zap_lock_upgrade() that drops and retakes the rwlock. Everything is
switched to use them, which vastly simplifies the surrounding code.
Because the zap_t, dbuf and dnode are never dropped, there's no way for
the upgrade operation to fail, and so the callers never have to deal
with the zap_t changing under them.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18546
---
 include/sys/zap_impl.h | 12 +++++++++
 module/zfs/zap.c       | 18 +++-----------
 module/zfs/zap_fat.c   | 55 +++++++-----------------------------------
 module/zfs/zap_impl.c  | 37 ++++++++++++++++++++++++++++
 module/zfs/zap_micro.c |  1 -
 5 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index d985a5a0294..b4f30405eba 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -258,6 +258,18 @@ int zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
 /* Unlock and release a zap_t. */
 void zap_unlock(zap_t *zap, const void *tag);
 
+/*
+ * Try to upgrade a zap lock from READER to WRITER. If the upgrade is not
+ * possible without blocking, returns 0. If the upgrade happened, returns 1.
+ */
+int zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+/*
+ * Upgrade a zap lock from READER to WRITER. If it can't be upgraded
+ * immediately it will block.
+ */
+void zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx);
+
 /* zap_t release function for when associated dbuf is evicted. */
 void zap_evict_sync(void *dbu);
 
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index f319469e55e..8e3cf7e1c47 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -501,7 +501,6 @@ zap_add_impl(zap_t *zap, const char *key,
 	}
 	if (!zap->zap_ismicro) {
 		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
-		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(key) >= MZAP_NAME_LEN ||
 	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
@@ -510,7 +509,6 @@ zap_add_impl(zap_t *zap, const char *key,
 			err = fzap_add(zn, integer_size, num_integers, val,
 			    tag, tx);
 		}
-		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else {
 		zfs_btree_index_t idx;
 		if (mze_find(zn, &idx) != NULL) {
@@ -521,8 +519,7 @@ zap_add_impl(zap_t *zap, const char *key,
 	}
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlock(zap, tag);
+	zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -573,10 +570,8 @@ zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
-	zap = zn->zn_zap;	/* fzap_add() may change zap */
 	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlock(zap, tag);
+	zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -635,7 +630,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	if (!zap->zap_ismicro) {
 		err = fzap_update(zn, integer_size, num_integers, val,
 		    FTAG, tx);
-		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(name) >= MZAP_NAME_LEN) {
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
@@ -646,7 +640,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 			err = fzap_update(zn, integer_size, num_integers,
 			    val, FTAG, tx);
 		}
-		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else {
 		zfs_btree_index_t idx;
 		mzap_ent_t *mze = mze_find(zn, &idx);
@@ -658,8 +651,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	}
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlock(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -678,10 +670,8 @@ zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
-	zap = zn->zn_zap;	/* fzap_update() may change zap */
 	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlock(zap, tag);
+	zap_unlock(zap, tag);
 	return (err);
 }
 
diff --git a/module/zfs/zap_fat.c b/module/zfs/zap_fat.c
index 2d068f64671..6cf773e786e 100644
--- a/module/zfs/zap_fat.c
+++ b/module/zfs/zap_fat.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
  * Copyright (c) 2023, Klara Inc.
+ * Copyright (c) 2026, TrueNAS.
  */
 
 /*
@@ -157,18 +158,6 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
 	dmu_buf_rele(db, FTAG);
 }
 
-static int
-zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
-{
-	if (RW_WRITE_HELD(&zap->zap_rwlock))
-		return (1);
-	if (rw_tryupgrade(&zap->zap_rwlock)) {
-		dmu_buf_will_dirty(zap->zap_dbuf, tx);
-		return (1);
-	}
-	return (0);
-}
-
 /*
  * Generic routines for dealing with the pointer & cookie tables.
  */
@@ -711,6 +700,7 @@ static int
 zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
     const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
 {
+	(void) tag;
 	zap_t *zap = zn->zn_zap;
 	uint64_t hash = zn->zn_hash;
 	int err;
@@ -722,21 +712,13 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
-	if (zap_tryupgradedir(zap, tx) == 0 ||
+	if (zap_lock_try_upgrade(zap, tx) == 0 ||
 	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 		/* We failed to upgrade, or need to grow the pointer table */
-		objset_t *os = zap->zap_objset;
-		uint64_t object = zap->zap_object;
-
 		zap_put_leaf(l);
 		*lp = l = NULL;
-		zap_unlock(zap, tag);
-		err = zap_lock(os, object, tx, RW_WRITER,
-		    FALSE, FALSE, tag, &zn->zn_zap);
-		zap = zn->zn_zap;
-		if (err != 0)
-			return (err);
-		ASSERT(!zap->zap_ismicro);
+
+		zap_lock_upgrade(zap, tx);
 
 		while (old_prefix_len ==
 		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
@@ -801,6 +783,7 @@ static void
 zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
     const void *tag, dmu_tx_t *tx)
 {
+	(void) tag;
 	zap_t *zap = zn->zn_zap;
 	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
@@ -813,17 +796,7 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
 		 * We are in the middle of growing the pointer table, or
 		 * this leaf will soon make us grow it.
 		 */
-		if (zap_tryupgradedir(zap, tx) == 0) {
-			objset_t *os = zap->zap_objset;
-			uint64_t zapobj = zap->zap_object;
-
-			zap_unlock(zap, tag);
-			int err = zap_lock(os, zapobj, tx,
-			    RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
-			zap = zn->zn_zap;
-			if (err != 0)
-				return;
-		}
+		zap_lock_upgrade(zap, tx);
 
 		/* could have finished growing while our locks were down */
 		if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
@@ -946,7 +919,6 @@ fzap_add_cd(zap_name_t *zn,
 		zap_increment_num_entries(zap, 1, tx);
 	} else if (err == EAGAIN) {
 		err = zap_expand_leaf(zn, l, tag, tx, &l);
-		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
@@ -1009,7 +981,6 @@ fzap_update(zap_name_t *zn,
 
 	if (err == EAGAIN) {
 		err = zap_expand_leaf(zn, l, tag, tx, &l);
-		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
@@ -1392,22 +1363,14 @@ zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 		 * If there two empty sibling, we have work to do, so
 		 * we need to lock ZAP ptrtbl as WRITER.
 		 */
-		if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
+		if (!writer && (writer = zap_lock_try_upgrade(zap, tx)) == 0) {
 			/* We failed to upgrade */
 			if (l != NULL) {
 				zap_put_leaf(l);
 				l = NULL;
 			}
 
-			/*
-			 * Usually, the right way to upgrade from a READER lock
-			 * to a WRITER lock is to call zap_unlock() and
-			 * zap_lock(), but we do not have a tag. Instead,
-			 * we do it in more sophisticated way.
-			 */
-			rw_exit(&zap->zap_rwlock);
-			rw_enter(&zap->zap_rwlock, RW_WRITER);
-			dmu_buf_will_dirty(zap->zap_dbuf, tx);
+			zap_lock_upgrade(zap, tx);
 
 			zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 			writer = B_TRUE;
diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c
index c05985c0adb..b72aa82349e 100644
--- a/module/zfs/zap_impl.c
+++ b/module/zfs/zap_impl.c
@@ -431,6 +431,43 @@ zap_unlock(zap_t *zap, const void *tag)
 	dmu_buf_rele(zap->zap_dbuf, tag);
 }
 
+int
+zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+	if (RW_WRITE_HELD(&zap->zap_rwlock))
+		/* Already have writer, nothing to do. */
+		return (1);
+
+	/* Try to upgrade the lock in-place. */
+	if (rw_tryupgrade(&zap->zap_rwlock)) {
+		/*
+		 * Got it, mark buffer dirty, since we only do that in
+		 * zap_lock_impl() for writer.
+		 */
+		dmu_buf_will_dirty(zap->zap_dbuf, tx);
+		return (1);
+	}
+
+	return (0);
+}
+
+void
+zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+	if (zap_lock_try_upgrade(zap, tx))
+		return;
+
+	/*
+	 * It's safe to drop the lock here because we still have a hold on
+	 * zap_dbuf, which prevents the dbuf being evicted and the zap_t being
+	 * deallocated.
+	 */
+	rw_exit(&zap->zap_rwlock);
+
+	rw_enter(&zap->zap_rwlock, RW_WRITER);
+	dmu_buf_will_dirty(zap->zap_dbuf, tx);
+}
+
 void
 zap_evict_sync(void *dbu)
 {
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 04956b005c8..727f72d999f 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -363,7 +363,6 @@ mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
 		/* If we fail here, we would end up losing entries */
 		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
 		    tag, tx));
-		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
 	}
 	zap_name_free(zn);
 	vmem_free(mzp, sz);

From 2f283c99cc8cc5351b0d8e8aee833ac1e401e808 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 14 May 2026 20:06:23 +1000
Subject: [PATCH 044/129] zap: remove refcount tags from backend functions

Since we now never need to unlock/lock an existing zap_t, we don't need
to thread through the refcount tag everywhere, which lets us simplify a
lot of calls.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18546
---
 include/sys/zap_impl.h | 15 ++++++---------
 module/zfs/zap.c       | 18 ++++++++----------
 module/zfs/zap_fat.c   | 33 +++++++++++++--------------------
 module/zfs/zap_impl.c  | 10 +++-------
 module/zfs/zap_micro.c |  6 +++---
 5 files changed, 33 insertions(+), 49 deletions(-)

diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index b4f30405eba..6c72cd977c8 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -280,8 +280,7 @@ uint64_t zap_getflags(zap_t *zap);
 
 /* Microzap implementation. */
 zap_t *mzap_open(dmu_buf_t *db);
-int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx,
-    zap_flags_t flags);
+int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
 mzap_ent_t *mze_find(zap_name_t *zn, zfs_btree_index_t *idx);
 boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash);
 void mze_destroy(zap_t *zap);
@@ -300,19 +299,17 @@ int fzap_lookup(zap_name_t *zn,
     uint64_t *actual_num_integers);
 void fzap_prefetch(zap_name_t *zn);
 int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
-    const void *val, const void *tag, dmu_tx_t *tx);
-int fzap_update(zap_name_t *zn,
-    int integer_size, uint64_t num_integers, const void *val,
-    const void *tag, dmu_tx_t *tx);
+    const void *val, dmu_tx_t *tx);
+int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx);
 int fzap_length(zap_name_t *zn,
     uint64_t *integer_size, uint64_t *num_integers);
 int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
 int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
 void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
 void zap_put_leaf(struct zap_leaf *l);
-int fzap_add_cd(zap_name_t *zn,
-    uint64_t integer_size, uint64_t num_integers,
-    const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx);
+int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
+    const void *val, uint32_t cd, dmu_tx_t *tx);
 void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
 
 #ifdef	__cplusplus
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 8e3cf7e1c47..f4575946939 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -500,14 +500,13 @@ zap_add_impl(zap_t *zap, const char *key,
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
-		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+		err = fzap_add(zn, integer_size, num_integers, val, tx);
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(key) >= MZAP_NAME_LEN ||
 	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
-		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
+		err = mzap_upgrade(&zn->zn_zap, tx, 0);
 		if (err == 0) {
-			err = fzap_add(zn, integer_size, num_integers, val,
-			    tag, tx);
+			err = fzap_add(zn, integer_size, num_integers, val, tx);
 		}
 	} else {
 		zfs_btree_index_t idx;
@@ -569,7 +568,7 @@ zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
 		zap_unlock(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
-	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+	err = fzap_add(zn, integer_size, num_integers, val, tx);
 	zap_name_free(zn);
 	zap_unlock(zap, tag);
 	return (err);
@@ -628,17 +627,16 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
-		err = fzap_update(zn, integer_size, num_integers, val,
-		    FTAG, tx);
+		err = fzap_update(zn, integer_size, num_integers, val, tx);
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(name) >= MZAP_NAME_LEN) {
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
 		    (u_longlong_t)zapobj, integer_size,
 		    (u_longlong_t)num_integers, name);
-		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+		err = mzap_upgrade(&zn->zn_zap, tx, 0);
 		if (err == 0) {
 			err = fzap_update(zn, integer_size, num_integers,
-			    val, FTAG, tx);
+			    val, tx);
 		}
 	} else {
 		zfs_btree_index_t idx;
@@ -669,7 +667,7 @@ zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
 		zap_unlock(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
-	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
+	err = fzap_update(zn, integer_size, num_integers, val, tx);
 	zap_name_free(zn);
 	zap_unlock(zap, tag);
 	return (err);
diff --git a/module/zfs/zap_fat.c b/module/zfs/zap_fat.c
index 6cf773e786e..7b48c6fd5a1 100644
--- a/module/zfs/zap_fat.c
+++ b/module/zfs/zap_fat.c
@@ -697,10 +697,8 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
 }
 
 static int
-zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
-    const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
 {
-	(void) tag;
 	zap_t *zap = zn->zn_zap;
 	uint64_t hash = zn->zn_hash;
 	int err;
@@ -780,10 +778,8 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
 }
 
 static void
-zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
-    const void *tag, dmu_tx_t *tx)
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 {
-	(void) tag;
 	zap_t *zap = zn->zn_zap;
 	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
@@ -887,9 +883,8 @@ fzap_lookup(zap_name_t *zn,
 }
 
 int
-fzap_add_cd(zap_name_t *zn,
-    uint64_t integer_size, uint64_t num_integers,
-    const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx)
+fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
+    const void *val, uint32_t cd, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err;
@@ -918,7 +913,7 @@ fzap_add_cd(zap_name_t *zn,
 	if (err == 0) {
 		zap_increment_num_entries(zap, 1, tx);
 	} else if (err == EAGAIN) {
-		err = zap_expand_leaf(zn, l, tag, tx, &l);
+		err = zap_expand_leaf(zn, l, tx, &l);
 		if (err == 0)
 			goto retry;
 	}
@@ -928,28 +923,26 @@ fzap_add_cd(zap_name_t *zn,
 		if (err == ENOSPC)
 			zap_put_leaf(l);
 		else
-			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
 	}
 	return (err);
 }
 
 int
-fzap_add(zap_name_t *zn,
-    uint64_t integer_size, uint64_t num_integers,
-    const void *val, const void *tag, dmu_tx_t *tx)
+fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
 {
 	int err = fzap_check(zn, integer_size, num_integers);
 	if (err != 0)
 		return (err);
 
 	return (fzap_add_cd(zn, integer_size, num_integers,
-	    val, ZAP_NEED_CD, tag, tx));
+	    val, ZAP_NEED_CD, tx));
 }
 
 int
-fzap_update(zap_name_t *zn,
-    int integer_size, uint64_t num_integers, const void *val,
-    const void *tag, dmu_tx_t *tx)
+fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err;
@@ -980,7 +973,7 @@ fzap_update(zap_name_t *zn,
 	}
 
 	if (err == EAGAIN) {
-		err = zap_expand_leaf(zn, l, tag, tx, &l);
+		err = zap_expand_leaf(zn, l, tx, &l);
 		if (err == 0)
 			goto retry;
 	}
@@ -989,7 +982,7 @@ fzap_update(zap_name_t *zn,
 		if (err == ENOSPC)
 			zap_put_leaf(l);
 		else
-			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
 	}
 	return (err);
 }
diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c
index b72aa82349e..0c2ba1cdbfe 100644
--- a/module/zfs/zap_impl.c
+++ b/module/zfs/zap_impl.c
@@ -285,12 +285,8 @@ zap_hash(zap_name_t *zn)
 	return (h);
 }
 
-/*
- * This routine "consumes" the caller's hold on the dbuf, which must
- * have the specified tag.
- */
 static int
-zap_lock_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
+zap_lock_impl(dnode_t *dn, dmu_buf_t *db, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
 	ASSERT0(db->db_offset);
@@ -349,7 +345,7 @@ zap_lock_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
 			dprintf("upgrading obj %llu: num_entries=%u\n",
 			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
 			*zapp = zap;
-			int err = mzap_upgrade(zapp, tag, tx, 0);
+			int err = mzap_upgrade(zapp, tx, 0);
 			if (err != 0)
 				rw_exit(&zap->zap_rwlock);
 			return (err);
@@ -399,7 +395,7 @@ zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
 	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
 	if (err != 0)
 		return (err);
-	err = zap_lock_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+	err = zap_lock_impl(dn, db, tx, lti, fatreader, adding, zapp);
 	if (err != 0)
 		dmu_buf_rele(db, tag);
 	else
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 727f72d999f..a7c9c9c03b4 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -324,7 +324,7 @@ mzap_open(dmu_buf_t *db)
 }
 
 int
-mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
+mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 {
 	int err = 0;
 	zap_t *zap = *zapp;
@@ -362,7 +362,7 @@ mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
 		zap_name_init_str(zn, mze->mze_name, 0);
 		/* If we fail here, we would end up losing entries */
 		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
-		    tag, tx));
+		    tx));
 	}
 	zap_name_free(zn);
 	vmem_free(mzp, sz);
@@ -406,7 +406,7 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
 		/* Only fat zap supports flags; upgrade immediately. */
 		VERIFY0(zap_lock_by_dnode(dn, tx,
 		    RW_WRITER, B_FALSE, B_FALSE, FTAG, &zap));
-		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
+		VERIFY0(mzap_upgrade(&zap, tx, flags));
 		zap_unlock(zap, FTAG);
 	}
 

From 6fb72fda0f60d9efb591e320f83f78b19ec451cc Mon Sep 17 00:00:00 2001
From: Saju Palayur <saju.palayur@gmail.com>
Date: Fri, 15 May 2026 14:15:05 -0700
Subject: [PATCH 045/129] zio_ddt_write: compute have_dvas after taking
 dde_io_lock

In zio_ddt_write(), have_dvas and is_ganged were computed before
dde_io_lock was taken. A concurrent zio_ddt_child_write_done() error
path calls ddt_phys_unextend() under dde_io_lock, which can zero
DVA[0] while another thread is between computing have_dvas and taking
dde_io_lock. That thread then uses the stale have_dvas=1 to call
ddt_bp_fill(), copying the zeroed DVA into the BP. A zero DVA resolves
as a hole, producing blocks that read back as zeros with no checksum
error (silent data corruption).

Fix by moving have_dvas and is_ganged computation to after dde_io_lock
is taken, so they always reflect the current state of dde->dde_phys.

Regression introduced by a41ef36858 ("DDT: Reduce global DDT lock
scope during writes").

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Saju Palayur <spalayur@maxlinear.com>
Signed-off-by: Saju Palayur <spalayur@maxlinear.com>
Closes #18366
Closes #18544
---
 module/zfs/zio.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 3e95103385c..4b7c13dd1e9 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3830,7 +3830,6 @@ zio_ddt_write(zio_t *zio)
 
 	int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
 	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
-	ddt_univ_phys_t *ddp = dde->dde_phys;
 
 	/*
 	 * In the common cases, at this point we have a regular BP with no
@@ -3861,14 +3860,6 @@ zio_ddt_write(zio_t *zio)
 	 * end of the chain and letting the sequence play out.
 	 */
 
-	/*
-	 * Number of DVAs in the DDT entry. If the BP is encrypted we ignore
-	 * the third one as normal.
-	 */
-	int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
-	IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
-	boolean_t is_ganged = ddt_phys_is_gang(ddp, v);
-
 	/* Number of DVAs requested by the IO. */
 	uint8_t need_dvas = zp->zp_copies;
 	/* Number of DVAs in outstanding writes for this dde. */
@@ -3883,6 +3874,21 @@ zio_ddt_write(zio_t *zio)
 	if (dde_io != NULL)
 		mutex_enter(&dde_io->dde_io_lock);
 
+	/*
+	 * Number of DVAs in the DDT entry. If the BP is encrypted we ignore
+	 * the third one as normal.
+	 *
+	 * Must be computed after taking dde_io_lock (if held) to avoid
+	 * racing with ddt_phys_unextend() in zio_ddt_child_write_done()
+	 * error path, which can zero DVAs under dde_io_lock. Without the
+	 * lock, a stale have_dvas causes ddt_bp_fill() to copy a zeroed
+	 * DVA into the BP, producing a hole that reads back as zeros.
+	 */
+	ddt_univ_phys_t *ddp = dde->dde_phys;
+	int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
+	IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
+	boolean_t is_ganged = ddt_phys_is_gang(ddp, v);
+
 	if (dde_io == NULL || dde_io->dde_lead_zio[p] == NULL) {
 		/*
 		 * No IO outstanding, so we only need to worry about ourselves.

From 40a87651d49aadd4370c5a87590651d6e11e498c Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Mon, 18 May 2026 05:13:59 +1000
Subject: [PATCH 046/129] zap_impl: use flex array field for
 mzap_phys_t.mz_chunks

mzap_phys_t is always a full-block allocation, with mz_chunk[] as an
array over the rest of the block past the header.

Recent Linux compiled with CONFIG_UBSAN will complain about this:

    UBSAN: array-index-out-of-bounds in module/zfs/zap.c:1236:28
    index 2 is out of range for type 'mzap_ent_phys_t [1]'

The fix is straightforward; simply convert this field to a flex member.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18550
---
 include/sys/zap_impl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index 6c72cd977c8..ea8963f550f 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -62,8 +62,9 @@ typedef struct mzap_phys {
 	uint64_t mz_salt;
 	uint64_t mz_normflags;
 	uint64_t mz_pad[5];
-	mzap_ent_phys_t mz_chunk[1];
+
 	/* actually variable size depending on block size */
+	mzap_ent_phys_t mz_chunk[];
 } mzap_phys_t;
 
 typedef struct mzap_ent {

From 891e379d0ff2e7285b009a0bdb108feb642daa98 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Date: Mon, 18 May 2026 09:12:09 -0700
Subject: [PATCH 047/129] Fix failfast default and usage

The feature that added a failfast property to vdevs unfortunately did
not correctly set the default at creation time, so many vdevs do not
actually have the property set. In addition, when the property is
used, the failfast flag is not checked correctly, resulting in the
feature mostly not working as intended.

Set the failfast property to the default value at vdev allocation time.
The value will be read in from the ZAP as normal when the vdev metadata
is loaded.  Allow the property to be set on any vdev and have it be
inherited from the root or top-level vdev.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Closes #18410
---
 include/sys/fs/zfs.h                          |   1 +
 man/man7/vdevprops.7                          |   5 +-
 module/os/linux/zfs/vdev_disk.c               |   8 +-
 module/zcommon/zpool_prop.c                   |  12 +-
 module/zfs/vdev.c                             |  30 +++--
 tests/runfiles/common.run                     |   4 +-
 tests/runfiles/sanity.run                     |   2 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../cli_root/zpool_set/zpool_set_inherit.ksh  | 115 ++++++++++++++++++
 9 files changed, 161 insertions(+), 17 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_inherit.ksh

diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index d9b6e7654b0..4c4d15f8ce0 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -363,6 +363,7 @@ typedef enum {
 /* Small enough to not hog a whole line of printout in zpool(8). */
 #define	ZPROP_MAX_COMMENT	32
 #define	ZPROP_BOOLEAN_NA	2
+#define	ZPROP_BOOLEAN_INHERIT	2
 
 #define	ZPROP_VALUE		"value"
 #define	ZPROP_SOURCE		"source"
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index 5f5e10723c1..da38acafeee 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -183,9 +183,12 @@ output.
 A text comment up to 8192 characters long
 .It Sy bootsize
 The amount of space to reserve for the EFI system partition
-.It Sy failfast
+.It Sy failfast Ns = Ns Sy inherit Ns | Ns Sy on Ns | Ns Sy off
 If this device should propagate BIO errors back to ZFS, used to disable
 failfast.
+.Sy inherit
+causes the vdev to adopt the behavior of its parent vdev,
+recursively up the tree.
 .It Sy sit_out
 Only valid for
 .Sy RAIDZ
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 66e10584ab5..7cc19fe5afb 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -931,8 +931,14 @@ vdev_disk_io_rw(zio_t *zio)
 		return (SET_ERROR(EIO));
 	}
 
+	vdev_t *iter = v;
+	while (iter != NULL && iter->vdev_failfast == ZPROP_BOOLEAN_INHERIT)
+		iter = iter->vdev_parent;
+
+	boolean_t failfast = iter ? iter->vdev_failfast == 1 :
+	    vdev_prop_default_numeric(VDEV_PROP_FAILFAST);
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
-	    v->vdev_failfast == B_TRUE) {
+	    failfast) {
 		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
 		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
 	}
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index 13a1390d1e1..ccd9f3854f5 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -374,10 +374,16 @@ vdev_prop_init(void)
 		{ "on",		1},
 		{ NULL }
 	};
+	static const zprop_index_t boolean_inherit_table[] = {
+		{ "off",	0},
+		{ "on",		1},
+		{ "inherit",	ZPROP_BOOLEAN_INHERIT},
+		{ NULL }
+	};
 	static const zprop_index_t boolean_na_table[] = {
 		{ "off",	0},
 		{ "on",		1},
-		{ "-",		2},	/* ZPROP_BOOLEAN_NA */
+		{ "-",		ZPROP_BOOLEAN_NA},
 		{ NULL }
 	};
 
@@ -555,8 +561,8 @@ vdev_prop_init(void)
 
 	/* default index properties */
 	zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,
-	    PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "FAILFAST", boolean_table,
-	    sfeatures);
+	    PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off | inherit", "FAILFAST",
+	    boolean_inherit_table, sfeatures);
 	zprop_register_index(VDEV_PROP_SLOW_IO_EVENTS, "slow_io_events",
 	    B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off",
 	    "SLOW_IO_EVENTS", boolean_table, sfeatures);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 9f083cd510f..e4dc9e97af7 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1117,6 +1117,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops))
 		vd->vdev_autosit =
 		    vdev_prop_default_numeric(VDEV_PROP_AUTOSIT);
+	if (ops == &vdev_root_ops)
+		vd->vdev_failfast =
+		    vdev_prop_default_numeric(VDEV_PROP_FAILFAST);
+	else
+		vd->vdev_failfast = ZPROP_BOOLEAN_INHERIT;
 
 	/*
 	 * Add ourselves to the parent's list of children.
@@ -3912,10 +3917,9 @@ vdev_load(vdev_t *vd)
 		    vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
 		    1, &failfast);
 		if (error == 0) {
-			vd->vdev_failfast = failfast & 1;
+			vd->vdev_failfast = failfast;
 		} else if (error == ENOENT) {
-			vd->vdev_failfast = vdev_prop_default_numeric(
-			    VDEV_PROP_FAILFAST);
+			vd->vdev_failfast = ZPROP_BOOLEAN_INHERIT;
 		} else {
 			vdev_dbgmsg(vd,
 			    "vdev_load: zap_lookup(top_zap=%llu) "
@@ -6230,11 +6234,14 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 				error = spa_vdev_alloc(spa, vdev_guid);
 			break;
 		case VDEV_PROP_FAILFAST:
-			if (nvpair_value_uint64(elem, &intval) != 0) {
+			if (nvpair_value_uint64(elem, &intval) != 0 ||
+			    intval > ZPROP_BOOLEAN_INHERIT ||
+			    (intval == ZPROP_BOOLEAN_INHERIT &&
+			    vd->vdev_ops == &vdev_root_ops)) {
 				error = EINVAL;
 				break;
 			}
-			vd->vdev_failfast = intval & 1;
+			vd->vdev_failfast = intval;
 			break;
 		case VDEV_PROP_SIT_OUT:
 			/* Only expose this for a draid or raidz leaf */
@@ -6764,18 +6771,23 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 				break;
 			case VDEV_PROP_FAILFAST:
 				src = ZPROP_SRC_LOCAL;
-				strval = NULL;
 
 				err = zap_lookup(mos, objid, nvpair_name(elem),
 				    sizeof (uint64_t), 1, &intval);
 				if (err == ENOENT) {
-					intval = vdev_prop_default_numeric(
-					    prop);
+					if (vd->vdev_ops == &vdev_root_ops)
+						intval =
+						    vdev_prop_default_numeric(
+						    prop);
+					else
+						intval = ZPROP_BOOLEAN_INHERIT;
 					err = 0;
 				} else if (err) {
 					break;
 				}
-				if (intval == vdev_prop_default_numeric(prop))
+				if (intval == ZPROP_BOOLEAN_INHERIT ||
+				    (vd->vdev_ops == &vdev_root_ops &&
+				    intval == 1))
 					src = ZPROP_SRC_DEFAULT;
 
 				vdev_prop_add_list(outnvl, propname, strval,
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 003e1c35495..82a2d1e815e 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -573,8 +573,8 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
 tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
-    'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
-    'user_property_001_pos', 'user_property_002_neg',
+    'zpool_set_ashift', 'zpool_set_features', 'zpool_set_inherit',
+    'vdev_set_001_pos', 'user_property_001_pos', 'user_property_002_neg',
     'zpool_set_clear_userprop','vdev_set_scheduler']
 tags = ['functional', 'cli_root', 'zpool_set']
 
diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run
index 936f2bcc32b..d62f8f3fb16 100644
--- a/tests/runfiles/sanity.run
+++ b/tests/runfiles/sanity.run
@@ -353,7 +353,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
 tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
-    'zpool_set_ashift', 'zpool_set_features']
+    'zpool_set_ashift', 'zpool_set_features', 'zpool_set_inherit']
 tags = ['functional', 'cli_root', 'zpool_set']
 
 [tests/functional/cli_root/zpool_split]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 75b53c6ddd0..85f00f28b0f 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1303,6 +1303,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_003_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_ashift.ksh \
+	functional/cli_root/zpool_set/zpool_set_inherit.ksh \
 	functional/cli_root/zpool_set/user_property_001_pos.ksh \
 	functional/cli_root/zpool_set/user_property_002_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_features.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_inherit.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_inherit.ksh
new file mode 100755
index 00000000000..2694e3278d9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_inherit.ksh
@@ -0,0 +1,115 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2026, Klara, Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+#
+# zpool set can set the failfast property to 'inherit'
+#
+# STRATEGY:
+# 1. Create a pool
+# 2. Verify that we can set 'failfast' to various values, including inherit
+# 3. Verify that the root vdev cannot be set to inherit
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL1
+	rm -f $FILEVDEV1 $FILEVDEV2 $FILEVDEV3
+}
+
+function get_failfast
+{
+	zpool get -H -o value failfast $TESTPOOL1 $@
+}
+
+log_onexit cleanup
+
+log_assert "zpool set can configure 'failfast' property to inherit"
+FILEVDEV1="$TEST_BASE_DIR/zpool_set_inherit1.$$.dat"
+FILEVDEV2="$TEST_BASE_DIR/zpool_set_inherit2.$$.dat"
+FILEVDEV3="$TEST_BASE_DIR/zpool_set_inherit3.$$.dat"
+
+log_must truncate -s $MINVDEVSIZE $FILEVDEV1
+log_must truncate -s $MINVDEVSIZE $FILEVDEV2
+log_must truncate -s $MINVDEVSIZE $FILEVDEV3
+
+log_must zpool create -f $TESTPOOL1 $FILEVDEV1 mirror $FILEVDEV2 $FILEVDEV3
+failfast=$(get_failfast $FILEVDEV1)
+[[ "$failfast" == "inherit" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=on $TESTPOOL1 $FILEVDEV1
+failfast=$(get_failfast $FILEVDEV1)
+[[ "$failfast" == "on" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=off $TESTPOOL1 $FILEVDEV1
+failfast=$(get_failfast $FILEVDEV1)
+[[ "$failfast" == "off" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=inherit $TESTPOOL1 $FILEVDEV1
+
+failfast=$(get_failfast $FILEVDEV2)
+[[ "$failfast" == "inherit" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=on $TESTPOOL1 $FILEVDEV2
+failfast=$(get_failfast $FILEVDEV2)
+[[ "$failfast" == "on" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=off $TESTPOOL1 $FILEVDEV2
+failfast=$(get_failfast $FILEVDEV2)
+[[ "$failfast" == "off" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=inherit $TESTPOOL1 $FILEVDEV2
+
+failfast=$(get_failfast mirror-1)
+[[ "$failfast" == "inherit" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=on $TESTPOOL1 mirror-1
+failfast=$(get_failfast mirror-1)
+[[ "$failfast" == "on" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=off $TESTPOOL1 mirror-1
+failfast=$(get_failfast mirror-1)
+[[ "$failfast" == "off" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=inherit $TESTPOOL1 mirror-1
+
+failfast=$(get_failfast root)
+[[ "$failfast" == "on" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_must zpool set failfast=off $TESTPOOL1 root
+failfast=$(get_failfast root)
+[[ "$failfast" == "off" ]] || log_fail "incorrect failfast value: $failfast"
+
+log_mustnot zpool set failfast=inherit $TESTPOOL1 root
+
+
+log_pass "zpool set can configure 'failfast' property to inherit"

From bd02c10b00e134a4640fbc57741ab277ef9659cc Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sun, 10 May 2026 11:55:01 +1000
Subject: [PATCH 048/129] zap: make the _by_dnode() op variants be the primary
 implementation

The existing pattern for each operation is to have a "frontend" function
that takes an object referenced by either an objset+object pair (eg
zap_add()) or an existing dnode (eg zap_add_by_dnode()). Those functions
obtain a locked zap_t for the given object from either zap_lockdir() or
zap_lockdir_by_dnode(). That zap_t, the operation args, and the refcount
tag for lockdir() are then passed through to the "backend" function (eg
zap_add_impl()), which does the work and then calls zap_unlockdir() to
release the zap_t.

This pattern is overcomplicated, in at least three ways:

- Both frontends for each operation have to make the call to
  zap_lockdir(), which has multiple args that must be the same for both.

- Frontends need to pass the refcount tag to the backend so it can
  call zap_unlockdir() correctly, which makes the signature more
  complicated.

- The only difference between the frontend functions is whether they
  call either zap_lockdir() or zap_lockdir_by_dnode(), and the only real
  difference between those is that the objset+object version takes a
  dnode hold first.

All of this makes the code very repetitive and difficult to read (and
thus to modify).

This commit addresses all of the above by having the _impl() function
take a dnode_t, rather than a zap_t. This allows zap_lockdir_by_dnode() to
be called in all cases from inside the _impl() function, so it only
needs to be specified in one place.

Then, because the lock and unlock are now done inside the same function,
there's no need for a separate tag arg - we can just use FTAG.

This results in the _by_dnode() functions being just direct calls and
returns to the _impl() functions, and so allows them to be removed
entirely, and _impl() to be renamed as _by_dnode().

Finally, the objset+object functions are simple mechanical wrappers
around dnode_hold(), _by_dnode(), dnode_rele().

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18551
---
 module/zfs/zap.c | 429 +++++++++++++++++------------------------------
 1 file changed, 156 insertions(+), 273 deletions(-)

diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index f4575946939..d7eded59eaf 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -220,17 +220,24 @@ zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
 
 /* zap_lookup */
 
-static int
-zap_lookup_impl(zap_t *zap, const char *name,
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
-	int err = 0;
+	zap_t *zap;
+
+	int err =
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
 
 	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
-	if (zn == NULL)
+	if (zn == NULL) {
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
+	}
 
 	if (!zap->zap_ismicro) {
 		err = fzap_lookup(zn, integer_size, num_integers, buf,
@@ -260,6 +267,7 @@ zap_lookup_impl(zap_t *zap, const char *name,
 		}
 	}
 	zap_name_free(zn);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -285,105 +293,64 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
-	zap_t *zap;
-
-	int err =
-	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	err = zap_lookup_impl(zap, name, integer_size,
+	err = zap_lookup_norm_by_dnode(dn, name, integer_size,
 	    num_integers, buf, mt, realname, rn_len, ncp);
-	zap_unlock(zap, FTAG);
-	return (err);
-}
-
-int
-zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf,
-    matchtype_t mt, char *realname, int rn_len,
-    boolean_t *ncp)
-{
-	zap_t *zap;
-
-	int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
-	    FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_impl(zap, name, integer_size,
-	    num_integers, buf, mt, realname, rn_len, ncp);
-	zap_unlock(zap, FTAG);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /* zap_lookup_uint64 */
 
-static int
-zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
-    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
-    uint64_t *actual_num_integers, const void *tag)
-{
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlock(zap, tag);
-		return (SET_ERROR(ENOTSUP));
-	}
-
-	int err = fzap_lookup(zn, integer_size, num_integers, buf,
-	    NULL, 0, NULL, actual_num_integers);
-	zap_name_free(zn);
-	zap_unlock(zap, tag);
-	return (err);
-}
-
-int
-zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
-    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, NULL, FTAG);
-	/* zap_lookup_length_uint64_impl() calls zap_unlock() */
-	return (err);
-}
-
-int
-zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
-    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, NULL, FTAG);
-	/* zap_lookup_length_uint64_impl() calls zap_unlock() */
-	return (err);
-}
-
 int
 zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
     uint64_t *actual_num_integers)
 {
 	zap_t *zap;
-
 	int err =
 	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
-	err = zap_lookup_length_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, buf, actual_num_integers, FTAG);
-	/* zap_lookup_length_uint64_impl() calls zap_unlock() */
+
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlock(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	err = fzap_lookup(zn, integer_size, num_integers, buf,
+	    NULL, 0, NULL, actual_num_integers);
+	zap_name_free(zn);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
+int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_length_uint64_by_dnode(dn, key, key_numints,
+	    integer_size, num_integers, buf, NULL);
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
+int
+zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	return (zap_lookup_length_uint64_by_dnode(dn, key, key_numints,
+	    integer_size, num_integers, buf, NULL));
+}
+
 /* zap_contains */
 
 int
@@ -422,19 +389,24 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
 
 /* zap_prefetch_uint64 */
 
-static int
-zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
-    const void *tag)
+int
+zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
 {
+	zap_t *zap;
+	int err =
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlock(zap, tag);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	fzap_prefetch(zn);
 	zap_name_free(zn);
-	zap_unlock(zap, tag);
+	zap_unlock(zap, FTAG);
 	return (0);
 }
 
@@ -442,28 +414,12 @@ int
 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints)
 {
-	zap_t *zap;
-
-	int err =
-	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	err = zap_prefetch_uint64_impl(zap, key, key_numints, FTAG);
-	/* zap_prefetch_uint64_impl() calls zap_unlock() */
-	return (err);
-}
-
-int
-zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_prefetch_uint64_impl(zap, key, key_numints, FTAG);
-	/* zap_prefetch_uint64_impl() calls zap_unlock() */
+	err = zap_prefetch_uint64_by_dnode(dn, key, key_numints);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 
@@ -486,17 +442,21 @@ zap_prefetch_object(objset_t *os, uint64_t zapobj)
 
 /* zap_add */
 
-static int
-zap_add_impl(zap_t *zap, const char *key,
+int
+zap_add_by_dnode(dnode_t *dn, const char *key,
     int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx, const void *tag)
+    const void *val, dmu_tx_t *tx)
 {
-	const uint64_t *intval = val;
-	int err = 0;
+	zap_t *zap;
+	int err =
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
 
+	const uint64_t *intval = val;
 	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
 	if (zn == NULL) {
-		zap_unlock(zap, tag);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
@@ -518,7 +478,7 @@ zap_add_impl(zap_t *zap, const char *key,
 	}
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
-	zap_unlock(zap, tag);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -527,50 +487,38 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
-	zap_t *zap;
-	int err;
-
-	err = zap_lock(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_impl() calls zap_unlock() */
-	return (err);
-}
-
-int
-zap_add_by_dnode(dnode_t *dn, const char *key,
-    int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-	int err;
-
-	err = zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_impl() calls zap_unlock() */
+	err = zap_add_by_dnode(dn, key, integer_size, num_integers, val, tx);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /* zap_add_uint64 */
 
-static int
-zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
+int
+zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx, const void *tag)
+    const void *val, dmu_tx_t *tx)
 {
-	int err;
+	zap_t *zap;
+	int err =
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
 
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlock(zap, tag);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_add(zn, integer_size, num_integers, val, tx);
+	zap = zn->zn_zap;	/* fzap_add() may change zap */
 	zap_name_free(zn);
-	zap_unlock(zap, tag);
+	if (zap != NULL)	/* may be NULL if fzap_add() failed */
+		zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -579,32 +527,13 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
-	zap_t *zap;
-
-	int err =
-	    zap_lock(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	err = zap_add_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_uint64_impl() calls zap_unlock() */
-	return (err);
-}
-
-int
-zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
-    int key_numints, int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_add_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_add_uint64_impl() calls zap_unlock() */
+	err = zap_add_uint64_by_dnode(dn, key, key_numints,
+	    integer_size, num_integers, val, tx);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 
@@ -655,21 +584,24 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 
 /* zap_update_uint64 */
 
-static int
-zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
-    const void *tag)
+int
+zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
-	int err;
+	zap_t *zap;
+	int err =
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
 
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlock(zap, tag);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_update(zn, integer_size, num_integers, val, tx);
 	zap_name_free(zn);
-	zap_unlock(zap, tag);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -678,31 +610,13 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers, const void *val,
     dmu_tx_t *tx)
 {
-	zap_t *zap;
-
-	int err =
-	    zap_lock(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	err = zap_update_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_update_uint64_impl() calls zap_unlock() */
-	return (err);
-}
-
-int
-zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_update_uint64_impl(zap, key, key_numints,
-	    integer_size, num_integers, val, tx, FTAG);
-	/* zap_update_uint64_impl() calls zap_unlock() */
+	err = zap_update_uint64_by_dnode(dn, key, key_numints,
+	    integer_size, num_integers, val, tx);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 
@@ -745,13 +659,12 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
 /* zap_length_uint64 */
 
 int
-zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
     int key_numints, uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_t *zap;
-
 	int err =
-	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
@@ -766,37 +679,36 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 }
 
 int
-zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t *integer_size, uint64_t *num_integers)
 {
-	zap_t *zap;
-
-	int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
-	    FTAG, &zap);
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlock(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-	err = fzap_length(zn, integer_size, num_integers);
-	zap_name_free(zn);
-	zap_unlock(zap, FTAG);
+	err = zap_length_uint64_by_dnode(dn, key, key_numints,
+	    integer_size, num_integers);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /* zap_remove */
 
 static int
-zap_remove_impl(zap_t *zap, const char *name,
-    matchtype_t mt, dmu_tx_t *tx)
+zap_remove_norm_by_dnode(dnode_t *dn, const char *name, matchtype_t mt,
+    dmu_tx_t *tx)
 {
-	int err = 0;
+	zap_t *zap;
+	int err =
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err)
+		return (err);
 
 	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
-	if (zn == NULL)
+	if (zn == NULL) {
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
+	}
 	if (!zap->zap_ismicro) {
 		err = fzap_remove(zn, tx);
 	} else {
@@ -811,6 +723,7 @@ zap_remove_impl(zap_t *zap, const char *name,
 		}
 	}
 	zap_name_free(zn);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -823,48 +736,42 @@ zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
 int
 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
 {
-	zap_t *zap;
-	int err;
-
-	err = zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err)
-		return (err);
-	err = zap_remove_impl(zap, name, 0, tx);
-	zap_unlock(zap, FTAG);
-	return (err);
+	return (zap_remove_norm_by_dnode(dn, name, 0, tx));
 }
 
 int
 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
     matchtype_t mt, dmu_tx_t *tx)
 {
-	zap_t *zap;
-	int err;
-
-	err = zap_lock(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err)
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0)
 		return (err);
-	err = zap_remove_impl(zap, name, mt, tx);
-	zap_unlock(zap, FTAG);
+	err = zap_remove_norm_by_dnode(dn, name, mt, tx);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /* zap_remove_uint64 */
 
-static int
-zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
-    dmu_tx_t *tx, const void *tag)
+int
+zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
+    dmu_tx_t *tx)
 {
-	int err;
+	zap_t *zap;
+	int err =
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
 
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlock(zap, tag);
+		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_remove(zn, tx);
 	zap_name_free(zn);
-	zap_unlock(zap, tag);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -872,41 +779,23 @@ int
 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, dmu_tx_t *tx)
 {
-	zap_t *zap;
-
-	int err =
-	    zap_lock(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
-	/* zap_remove_uint64_impl() calls zap_unlock() */
-	return (err);
-}
-
-int
-zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
-    dmu_tx_t *tx)
-{
-	zap_t *zap;
-
-	int err =
-	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err != 0)
-		return (err);
-	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
-	/* zap_remove_uint64_impl() calls zap_unlock() */
+	err = zap_remove_uint64_by_dnode(dn, key, key_numints, tx);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /* zap_count */
 
 int
-zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+zap_count_by_dnode(dnode_t *dn, uint64_t *count)
 {
 	zap_t *zap;
-
 	int err =
-	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	if (!zap->zap_ismicro) {
@@ -919,20 +808,14 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 }
 
 int
-zap_count_by_dnode(dnode_t *dn, uint64_t *count)
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 {
-	zap_t *zap;
-
-	int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
-	    FTAG, &zap);
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	if (!zap->zap_ismicro) {
-		err = fzap_count(zap, count);
-	} else {
-		*count = zap->zap_m.zap_num_entries;
-	}
-	zap_unlock(zap, FTAG);
+	err = zap_count_by_dnode(dn, count);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 

From eed67e40430ebbdb62f38a9f3ffc36544a1ac3f7 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sun, 10 May 2026 13:32:11 +1000
Subject: [PATCH 049/129] zap: split objset+object implementations to use a
 dnode

For the functions that don't (yet) have _by_dnode() variants, give them
the same treatment as the previous commit - pull their implementation
into a _by_dnode() function, with the original as a simple wrapper.

This lets them all follow the same uniform pattern, and lays the
groundwork for further cleanup in other non-dnode parts of the ZAP
subsystem.

Note that it would be trivial to expose these new _by_dnode() functions,
but there's no need to do that until there's an external need for them.

Also note that there's no change yet to the following, which are not
simple zap_t operations in the same way:

 - zap_contains: wrapper around other ops
 - zap_increment: wrapper around other ops
 - zap_*_int(): wrappers around other ops
 - zap_cursor_*: different lifetime constraints
 - zap_value_search: cursor-based
 - zap_join_*: cursor-based

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18551
---
 module/zfs/zap.c | 95 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 72 insertions(+), 23 deletions(-)

diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index d7eded59eaf..3de985c37cb 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -365,17 +365,16 @@ zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 
 /* zap_prefetch */
 
-int
-zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
+static int
+zap_prefetch_by_dnode(dnode_t *dn, const char *name)
 {
 	zap_t *zap;
-	int err;
-	zap_name_t *zn;
-
-	err = zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	int err =
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
-	zn = zap_name_alloc_str(zap, name, 0);
+
+	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
 		zap_unlock(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
@@ -387,6 +386,18 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
 	return (err);
 }
 
+int
+zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = zap_prefetch_by_dnode(dn, name);
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
 /* zap_prefetch_uint64 */
 
 int
@@ -539,17 +550,17 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 
 /* zap_update */
 
-int
-zap_update(objset_t *os, uint64_t zapobj, const char *name,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+static int
+zap_update_by_dnode(dnode_t *dn, const char *name, int integer_size,
+    uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
-	const uint64_t *intval = val;
-
 	int err =
-	    zap_lock(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
+
+	const uint64_t *intval = val;
 	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
 		zap_unlock(zap, FTAG);
@@ -560,7 +571,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(name) >= MZAP_NAME_LEN) {
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
-		    (u_longlong_t)zapobj, integer_size,
+		    (u_longlong_t)dn->dn_object, integer_size,
 		    (u_longlong_t)num_integers, name);
 		err = mzap_upgrade(&zn->zn_zap, tx, 0);
 		if (err == 0) {
@@ -582,6 +593,20 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	return (err);
 }
 
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = zap_update_by_dnode(dn, name,
+	    integer_size, num_integers, val, tx);
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
 /* zap_update_uint64 */
 
 int
@@ -622,16 +647,16 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 
 /* zap_length */
 
-int
-zap_length(objset_t *os, uint64_t zapobj, const char *name,
-    uint64_t *integer_size, uint64_t *num_integers)
+static int
+zap_length_by_dnode(dnode_t *dn, const char *name, uint64_t *integer_size,
+    uint64_t *num_integers)
 {
 	zap_t *zap;
-
 	int err =
-	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
+
 	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
 		zap_unlock(zap, FTAG);
@@ -656,6 +681,19 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
 	return (err);
 }
 
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t *integer_size, uint64_t *num_integers)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = zap_length_by_dnode(dn, name, integer_size, num_integers);
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
 /* zap_length_uint64 */
 
 int
@@ -1170,13 +1208,12 @@ zap_cursor_serialize(zap_cursor_t *zc)
 
 /* zap_get_stats */
 
-int
-zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+static int
+zap_get_stats_by_dnode(dnode_t *dn, zap_stats_t *zs)
 {
 	zap_t *zap;
-
 	int err =
-	    zap_lock(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	    zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 
@@ -1193,6 +1230,18 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 	return (0);
 }
 
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = zap_get_stats_by_dnode(dn, zs);
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
 EXPORT_SYMBOL(zap_create);
 EXPORT_SYMBOL(zap_create_dnsize);
 EXPORT_SYMBOL(zap_create_norm);

From 3f44da701b5f67baf67687b4a1f869e28b63ef90 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Mon, 18 May 2026 20:27:45 +0200
Subject: [PATCH 050/129] CI: remove FreeBSD 13.5 (EOL April 30, 2026)

FreeBSD 13.5 and stable/13 reached End-of-Life on April 30, 2026 and no
longer receive security support, so they fall outside README.md's stated
support policy.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18553
---
 .github/workflows/scripts/qemu-2-start.sh | 17 -----------------
 .github/workflows/zfs-qemu.yml            |  6 +++---
 README.md                                 |  2 +-
 3 files changed, 4 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh
index 7f27eeffed6..5684b3ca2e9 100755
--- a/.github/workflows/scripts/qemu-2-start.sh
+++ b/.github/workflows/scripts/qemu-2-start.sh
@@ -88,14 +88,6 @@ case "$OS" in
     OSv="fedora-unknown"
     URL="https://download.fedoraproject.org/pub/fedora/linux/releases/44/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-44-1.7.x86_64.qcow2"
     ;;
-  freebsd13-5r)
-    FreeBSD="13.5-RELEASE"
-    OSNAME="FreeBSD $FreeBSD"
-    OSv="freebsd13.0"
-    URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
-    KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
-    NIC="rtl8139"
-    ;;
   freebsd14-4r)
     FreeBSD="14.4-RELEASE"
     OSNAME="FreeBSD $FreeBSD"
@@ -110,14 +102,6 @@ case "$OS" in
     URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
     KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
     ;;
-  freebsd13-5s)
-    FreeBSD="13.5-STABLE"
-    OSNAME="FreeBSD $FreeBSD"
-    OSv="freebsd13.0"
-    URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
-    KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
-    NIC="rtl8139"
-    ;;
   freebsd14-4s)
     FreeBSD="14.4-STABLE"
     OSNAME="FreeBSD $FreeBSD"
@@ -168,7 +152,6 @@ echo "ENV=$ENV" >> $ENV
 # result path
 echo 'RESPATH="/var/tmp/test_results"' >> $ENV
 
-# FreeBSD 13 has problems with: e1000 and virtio
 echo "NIC=$NIC" >> $ENV
 
 # freebsd15 -> used in zfs-qemu.yml
diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index 9a594db6854..7f3b7b1dee4 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -55,7 +55,7 @@ jobs:
             os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora43", "fedora44", "ubuntu22", "ubuntu24"]'
             ;;
           freebsd)
-            os_selection='["freebsd13-5r", "freebsd14-4r", "freebsd13-5s", "freebsd14-4s", "freebsd15-1s", "freebsd16-0c"]'
+            os_selection='["freebsd14-4r", "freebsd14-4s", "freebsd15-1s", "freebsd16-0c"]'
             ;;
           *)
             # default list
@@ -102,8 +102,8 @@ jobs:
         # debian:  debian12, debian13, ubuntu22, ubuntu24
         # misc:    archlinux, tumbleweed
         # FreeBSD variants of november 2025:
-        # FreeBSD Release: freebsd13-5r, freebsd14-4r, freebsd15-0r
-        # FreeBSD Stable:  freebsd13-5s, freebsd14-4s, freebsd15-1s
+        # FreeBSD Release: freebsd14-4r, freebsd15-0r
+        # FreeBSD Stable:  freebsd14-4s, freebsd15-1s
         # FreeBSD Current: freebsd16-0c
         os: ${{ fromJson(needs.test-config.outputs.test_os) }}
     runs-on: ubuntu-24.04
diff --git a/README.md b/README.md
index fa348a24383..8c877cb8023 100644
--- a/README.md
+++ b/README.md
@@ -68,4 +68,4 @@ Generally, if a distribution is following an LTS kernel, it should work well wit
 
 All FreeBSD releases receiving [security support](https://www.freebsd.org/security/#sup) are supported by OpenZFS.
 
-**Supported FreeBSD releases**: **15.0**, **14.4**, **13.5**.
+**Supported FreeBSD releases**: **15.0**, **14.4**.

From 536c06be82cd001080407a18175b36ed2f0ea84e Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Tue, 19 May 2026 09:05:29 +1000
Subject: [PATCH 051/129] config: show progress output for kernel API checks

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18554
---
 config/kernel.m4 | 114 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 3 deletions(-)

diff --git a/config/kernel.m4 b/config/kernel.m4
index b40e34d373f..7225591b86d 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -153,9 +153,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 			;;
 	esac
 
-	AC_MSG_CHECKING([for available kernel interfaces])
-	ZFS_LINUX_TEST_COMPILE_ALL([kabi])
-	AC_MSG_RESULT([done])
+	ZFS_LINUX_TEST_COMPILE_ALL([kabi], [for available kernel interfaces])
 ])
 
 dnl #
@@ -753,6 +751,108 @@ AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [
 	], [], [yes])
 ])
 
+dnl #
+dnl # Progress output for ZFS_LINUX_TEST_COMPILE_ALL
+dnl #
+dnl # From clean, we currently have ~250 kernel tests to compile. This can
+dnl # take anywhere from a few seconds to a few minutes while we wait for
+dnl # the module build invocation to complete (see ZFS_LINUX_COMPILE).
+dnl #
+dnl # To show some progress in the main set of tests, we start a background
+dnl # job to monitor the build progress and update the output.
+dnl #
+AC_DEFUN([_ZFS_LINUX_TEST_COMPILE_PROGRESS_START], [
+	dnl # normal "checking for..." output
+	AC_MSG_CHECKING([$2])
+
+	dnl # don't start the background job if configure was called with
+	dnl # --silent or --quiet, or if configure's output stream is not
+	dnl # attached to a terminal
+	AS_IF([test "x$silent" != "xyes" -a -t AS_MESSAGE_FD], [
+		dnl # save "checking" message for cleanup later
+		_zfs_linux_test_progress_text="$2"
+
+		dnl # new shell job in background
+		(
+			dnl # ZFS_LINUX_CONFTEST_MAKEFILE adds one line per
+			dnl # test to the top Makefile, so the line count
+			dnl # is our target
+			total=$(wc -l < $1/Makefile)
+			count=0
+
+			dnl # eject if our parent process has gone away. this
+			dnl # is protection against the parent being killed.
+			dnl # (we can't use trap because autoconf generates
+			dnl # that and doesn't provide an easy way to hook it).
+			while kill -0 $$ 2>/dev/null ; do
+
+				dnl # ZFS_LINUX_TEST_COMPILE_ALL has a short
+				dnl # second stage for modpost, where build.log
+				dnl # is recreated. we make some effort to both
+				dnl # detect that and handle it, mostly by
+				dnl # making sure the counter never goes
+				dnl # backwards.
+				if test "$count" -lt "$total" ; then
+					dnl # if build.log went away, then
+					dnl # we never got to do a last count,
+					dnl # so we can assume they're all
+					dnl # finished and just bump the count
+					dnl # to the total
+					if ! test -f $1/build.log ; then
+						count=$total
+					else
+						dnl # look for compilation lines
+						dnl # (CC) for .o files that
+						dnl # are in a dir (so not
+						dnl # whole-of-build artifacts)
+						dnl # and only have a single
+						dnl # period (so not .mod.o
+						dnl # link artifacts)
+						count_n=$(awk '/CC/ && /\/[[^\.]]+\.o$/ { c++ } END { print c }' $1/build.log 2>/dev/null)
+						if test "x$count_n" != "x" ; then
+							dnl # empty output
+							dnl # means awk failed,
+							dnl # likely build.log
+							dnl # went away. use
+							dnl # the current count
+							count=$count_n
+						fi
+					fi
+
+					dnl # re-output the entire message with
+					dnl # the new counts
+					printf '\rchecking %s... %d/%d' "$2" "$count" "$total" >&6
+				fi
+
+				dnl # yield before loop
+				sleep 0.5
+			done
+		) &
+
+		dnl # save the pid so we can kill it later
+		_zfs_linux_test_progress_pid=$!
+	])
+])
+
+AC_DEFUN([_ZFS_LINUX_TEST_COMPILE_PROGRESS_DONE], [
+	dnl # only do cleanup if we actually started the job
+	AS_IF([test "x$_zfs_linux_test_progress_pid" != "x"], [
+		dnl # kill it; no-op if it already died
+		kill $_zfs_linux_test_progress_pid 2>/dev/null
+		dnl # wait for it to really go away and clean it up
+		wait $_zfs_linux_test_progress_pid 2>/dev/null
+		dnl # reprint the original checking line. the control code
+		dnl # is ANSI "erase entire line"
+		printf '\r\033\1332Kchecking %s... ' "$_zfs_linux_test_progress_text" >&AS_MESSAGE_FD
+		dnl # cleanup for next run
+		_zfs_linux_test_progress_pid=
+		_zfs_linux_test_progress_text=
+	])
+
+	dnl # normal final output for screen and config.log
+	AC_MSG_RESULT([$1])
+])
+
 dnl #
 dnl # Perform the compilation of the test cases in two phases.
 dnl #
@@ -771,6 +871,10 @@ dnl # The maximum allowed parallelism can be controlled by setting the
 dnl # TEST_JOBS environment variable.  Otherwise, it default to $(nproc).
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [
+	AS_IF([test "x$2" != "x"], [
+		_ZFS_LINUX_TEST_COMPILE_PROGRESS_START([build], [$2])
+	])
+
 	dnl # Phase 1 - Compilation only, final linking is skipped.
 	ZFS_LINUX_TEST_COMPILE([$1], [build])
 
@@ -818,6 +922,10 @@ AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [
 			])
 		done
 	])
+
+	AS_IF([test "x$2" != "x"], [
+		_ZFS_LINUX_TEST_COMPILE_PROGRESS_DONE([done])
+	])
 ])
 
 dnl #

From ea7fd8a7bc97fd32a2e0b5687f1a086c31231046 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Tue, 19 May 2026 18:08:48 +0200
Subject: [PATCH 052/129] libzfs_pool: add docstrings to several public
 functions

Cover a number of frequently-used functions that previously had no
documentation.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18538
---
 lib/libzfs/libzfs_pool.c | 42 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 7c4c081edb4..f82211699f5 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2583,6 +2583,10 @@ xlate_init_err(int err)
 	return (err);
 }
 
+/*
+ * Start (or cancel/suspend/uninit) the initialize operation on every
+ * leaf vdev of the pool.
+ */
 int
 zpool_initialize_one(zpool_handle_t *zhp, void *data)
 {
@@ -2762,6 +2766,10 @@ zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
 	}
 }
 
+/*
+ * Start (or cancel/suspend) the trim operation on every leaf vdev of
+ * the pool.
+ */
 int
 zpool_trim_one(zpool_handle_t *zhp, void *data)
 {
@@ -3409,6 +3417,11 @@ __zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
 	return (ret);
 }
 
+/*
+ * Look up a vdev in the pool by path, name, or guid.  Returns the
+ * vdev's configuration nvlist, or NULL on no match.  Also, fills
+ * in avail_spare, l2cache, and log if they are non-NULL.
+ */
 nvlist_t *
 zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
     boolean_t *l2cache, boolean_t *log)
@@ -4653,7 +4666,10 @@ zpool_reopen_one(zpool_handle_t *zhp, void *data)
 	return (0);
 }
 
-/* call into libzfs_core to execute the sync IOCTL per pool */
+/*
+ * Block until every buffered write for the pool has reached the
+ * underlying disks.
+ */
 int
 zpool_sync_one(zpool_handle_t *zhp, void *data)
 {
@@ -4929,6 +4945,10 @@ zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
 	return (0);
 }
 
+/*
+ * Format the program name and its command-line arguments into a single
+ * space-separated string.
+ */
 void
 zfs_save_arguments(int argc, char **argv, char *string, int len)
 {
@@ -4941,6 +4961,10 @@ zfs_save_arguments(int argc, char **argv, char *string, int len)
 	}
 }
 
+/*
+ * Append a message to the pool's command-history log, retrievable via
+ * "zpool history".
+ */
 int
 zpool_log_history(libzfs_handle_t *hdl, const char *message)
 {
@@ -5236,6 +5260,11 @@ zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
 	free(mntpnt);
 }
 
+/*
+ * Translate a (dataset object id, file object id) pair into a readable
+ * path.  If the dataset is mounted the result is an absolute filesystem
+ * path; otherwise it is `dataset:path`.
+ */
 void
 zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
     char *pathname, size_t len)
@@ -5243,6 +5272,10 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
 	zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_FALSE);
 }
 
+/*
+ * Translate a (dataset object id, file object id) pair into a
+ * `dataset:path` string.
+ */
 void
 zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
     char *pathname, size_t len)
@@ -5297,6 +5330,10 @@ zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity,
 	return (error);
 }
 
+/*
+ * Store a boot configuration map in the bootenv area of each leaf
+ * vdev's labels.
+ */
 int
 zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap)
 {
@@ -5310,6 +5347,9 @@ zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap)
 	return (error);
 }
 
+/*
+ * Read the boot configuration map from each leaf vdev's bootenv area.
+ */
 int
 zpool_get_bootenv(zpool_handle_t *zhp, nvlist_t **nvlp)
 {

From e5473afe18a013ec76070e8ef10597f7cda153f2 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 20 May 2026 02:11:31 +1000
Subject: [PATCH 053/129] spl_kvmalloc: remove __GFP_COMP before calling
 vmalloc()

In cb1833023 we stopped using it for KM_VMEM allocations, since it's not
a valid flag for vmalloc(). However, there's a fallback path for
non-KM_VMEM allocations to use vmalloc(), and we need to remove
__GFP_COMP there too to avoid a warning.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18558
---
 module/os/linux/spl/spl-kmem.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/module/os/linux/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c
index 9fe4042b507..6e340261980 100644
--- a/module/os/linux/spl/spl-kmem.c
+++ b/module/os/linux/spl/spl-kmem.c
@@ -188,6 +188,12 @@ spl_kvmalloc(size_t size, gfp_t lflags)
 		return (ptr);
 	}
 
+	/*
+	 * vmalloc fallback. KM_VMEM may not have been requested originally if
+	 * we've come through spl_kmem_alloc_impl(), so we need to remove
+	 * __GFP_COMP, which is not a valid flag for vmalloc.
+	 */
+	lflags &= ~__GFP_COMP;
 	return (spl_vmalloc(size, lflags));
 }
 

From 5fde52c3f9d293fe05b7206494d97278490a9b75 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 14 May 2026 13:25:44 -0700
Subject: [PATCH 054/129] CI: Add Ubuntu 26.04 builder

Ubuntu 26.04 LTS, named "Resolute Raccoon", was released on
April 23, 2026.  Add it to the supported releases in README.md and
add a CI builder for it.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18547
---
 .github/workflows/scripts/qemu-2-start.sh   |  5 +++++
 .github/workflows/scripts/qemu-3-deps-vm.sh | 25 +++++++++++++++++++--
 .github/workflows/zfs-qemu.yml              |  8 +++----
 README.md                                   |  2 +-
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh
index 5684b3ca2e9..3eba651ae6d 100755
--- a/.github/workflows/scripts/qemu-2-start.sh
+++ b/.github/workflows/scripts/qemu-2-start.sh
@@ -139,6 +139,11 @@ case "$OS" in
     OSv="ubuntu24.04"
     URL="$UBMIRROR/noble/current/noble-server-cloudimg-amd64.img"
     ;;
+  ubuntu26)
+    OSNAME="Ubuntu 26.04"
+    OSv="ubuntu24.04"
+    URL="$UBMIRROR/resolute/current/resolute-server-cloudimg-amd64.img"
+    ;;
   *)
     echo "Wrong value for OS variable!"
     exit 111
diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh
index 6a83ef45fd2..d31a4725b5c 100755
--- a/.github/workflows/scripts/qemu-3-deps-vm.sh
+++ b/.github/workflows/scripts/qemu-3-deps-vm.sh
@@ -215,7 +215,7 @@ case "$1" in
   tumbleweed)
     tumbleweed
     ;;
-  ubuntu*)
+  ubuntu22|ubuntu24)
     debian
     echo "##[group]Install Ubuntu specific"
     sudo apt-get install -yq linux-tools-common libtirpc-dev \
@@ -226,6 +226,27 @@ case "$1" in
     # https://github.com/actions/runner-images/issues/9946
     sudo apt-get install -yq build-essential
 
+    echo "##[endgroup]"
+    echo "##[group]Delete Ubuntu OpenZFS modules"
+    for i in $(find /lib/modules -name zfs -type d); do sudo rm -rvf $i; done
+    echo "##[endgroup]"
+    ;;
+  ubuntu26)
+    debian
+    echo "##[group]Install Ubuntu specific"
+    # Skip linux-modules-extra which is already installed
+    sudo apt-get install -yq linux-tools-common
+    sudo apt-get install -yq libtirpc-dev
+    sudo apt-get install -yq dh-sequence-dkms
+
+    # Need 'build-essential' explicitly for ARM builder
+    # https://github.com/actions/runner-images/issues/9946
+    sudo apt-get install -yq build-essential
+
+    # Replace sudo-rs with sudo for now because the Rust version
+    # does not support -E to preserve the entire environment
+    sudo update-alternatives --set sudo /usr/bin/sudo.ws
+
     echo "##[endgroup]"
     echo "##[group]Delete Ubuntu OpenZFS modules"
     for i in $(find /lib/modules -name zfs -type d); do sudo rm -rvf $i; done
@@ -292,7 +313,7 @@ case "$1" in
     echo 'GRUB_SERIAL_COMMAND="serial --speed=115200"' \
       | sudo tee -a /etc/default/grub >/dev/null
     ;;
-  ubuntu24)
+  ubuntu24|ubuntu26)
     GRUB_CFG="/boot/grub/grub.cfg"
     GRUB_MKCONFIG="grub-mkconfig"
     echo 'GRUB_DISABLE_OS_PROBER="false"' \
diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index 7f3b7b1dee4..500e54ad84f 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -49,17 +49,17 @@ jobs:
             os_selection='[]'
             ;;
           quick)
-            os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora44", "freebsd15-1s", "ubuntu24"]'
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora44", "freebsd15-1s", "ubuntu26"]'
             ;;
           linux)
-            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora43", "fedora44", "ubuntu22", "ubuntu24"]'
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora43", "fedora44", "ubuntu22", "ubuntu24", "ubuntu26"]'
             ;;
           freebsd)
             os_selection='["freebsd14-4r", "freebsd14-4s", "freebsd15-1s", "freebsd16-0c"]'
             ;;
           *)
             # default list
-            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24"]'
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24", "ubuntu26"]'
             ;;
           esac
 
@@ -99,7 +99,7 @@ jobs:
       fail-fast: false
       matrix:
         # rhl:     almalinux8, almalinux9, centos-streamX, fedora4x
-        # debian:  debian12, debian13, ubuntu22, ubuntu24
+        # debian:  debian12, debian13, ubuntu22, ubuntu24, ubuntu26
         # misc:    archlinux, tumbleweed
         # FreeBSD variants of november 2025:
         # FreeBSD Release: freebsd14-4r, freebsd15-0r
diff --git a/README.md b/README.md
index 8c877cb8023..6a77cedb635 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ All RHEL (and compatible systems: AlmaLinux OS, Rocky Linux, etc) on the **full*
 
 All Ubuntu **LTS** releases are supported.
 
-**Supported Ubuntu releases**: **24.04 “Noble”**, **22.04 “Jammy”**.
+**Supported Ubuntu releases**: **26.04 “Resolute”**, **24.04 “Noble”**, **22.04 “Jammy”**.
 
 ### Debian
 

From bd2f0aa0574f640cc349f7c9db16221127b72a96 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 15 May 2026 12:31:12 -0700
Subject: [PATCH 055/129] CI: Fix qemu-guest-agent systemd enable

The qemu-guest-agent.service for Debian and Ubuntu does
not contain an install section which prevents it from
being enabled.  Add a drop-in override file so it can
be enabled and the service started on boot.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18547
---
 .github/workflows/scripts/qemu-3-deps-vm.sh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh
index d31a4725b5c..82cda8bf5bc 100755
--- a/.github/workflows/scripts/qemu-3-deps-vm.sh
+++ b/.github/workflows/scripts/qemu-3-deps-vm.sh
@@ -288,8 +288,16 @@ case "$1" in
     ;;
   debian*|ubuntu*)
     sudo -E systemctl enable nfs-kernel-server
-    sudo -E systemctl enable qemu-guest-agent
     sudo -E systemctl enable smbd
+
+    # add systemd drop-in to allow the service to be enabled
+    sudo -E mkdir -p /etc/systemd/system/qemu-guest-agent.service.d/
+    sudo -E tee /etc/systemd/system/qemu-guest-agent.service.d/override.conf <<EOF
+[Install]
+WantedBy=multi-user.target
+EOF
+    sudo -E systemctl daemon-reload
+    sudo -E systemctl enable qemu-guest-agent
     ;;
   *)
     # All other linux distros

From c59d690e567f3932ed8ac65b37bed4f1621282d5 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 15 May 2026 11:04:04 -0700
Subject: [PATCH 056/129] ZTS: Pass dec instead of hex to mknod

On Ubuntu 26.04 the default mknod command returns an error when
provided the major and minor numbers in hex.  Switch to passing
decimal values.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18547
---
 .../tests/functional/devices/devices_common.kshlib   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/devices/devices_common.kshlib b/tests/zfs-tests/tests/functional/devices/devices_common.kshlib
index 8024067ac9e..3298b49fec7 100644
--- a/tests/zfs-tests/tests/functional/devices/devices_common.kshlib
+++ b/tests/zfs-tests/tests/functional/devices/devices_common.kshlib
@@ -54,9 +54,9 @@ function create_dev_file
 			# %t - major device type in hex
 			# %T - minor device type in hex
 			#
-			major=$(stat --dereference --format="%t" "$devstr")
-			minor=$(stat --dereference --format="%T" "$devstr")
-			log_must mknod $filename b "0x${major}" "0x${minor}"
+			major=$(printf '%d' 0x$(stat -L -c "%t" "$devstr"))
+			minor=$(printf '%d' 0x$(stat -L -c "%T" "$devstr"))
+			log_must mknod $filename b "${major}" "${minor}"
 			;;
 		*)
 			#
@@ -83,9 +83,9 @@ function create_dev_file
 			# %t - major device type in hex
 			# %T - minor device type in hex
 			#
-			major=$(stat --format="%t" /dev/null)
-			minor=$(stat --format="%T" /dev/null)
-			log_must mknod $filename c "0x${major}" "0x${minor}"
+			major=$(printf '%d' 0x$(stat -c "%t" /dev/null))
+			minor=$(printf '%d' 0x$(stat -c "%T" /dev/null))
+			log_must mknod $filename c "${major}" "${minor}"
 			;;
 		FreeBSD)
 			#

From d64dcd257513a9bff6368541e0c19cd6b3c986b1 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Mon, 18 May 2026 12:19:24 -0700
Subject: [PATCH 057/129] ZTS: statx_dioalign.ksh update to stride_dd

The uutils 0.8.0 version of dd appears to diverge from GNU behavior
and does not fail when an unaligned O_DIRECT write is issued.
Update the test case to use stride_dd which is provided by the ZTS
so the expected syscall behavior can be verified.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18547
---
 .../tests/functional/stat/statx_dioalign.ksh          | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh b/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh
index ab749b5f793..ea10e492503 100755
--- a/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh
+++ b/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh
@@ -89,7 +89,8 @@ typeset -i PAGE_SIZE=$(getconf PAGE_SIZE)
 # Set recordsize to 128K, and make a 64K file (so only one block) for the
 # sizing tests below.
 log_must zfs set recordsize=128K $TESTDS
-log_must dd if=/dev/urandom of=$TESTFILE bs=64k count=1
+log_must rm -f $TESTFILE
+log_must stride_dd -i /dev/urandom -o $TESTFILE -b 65536 -c 1
 log_must zpool sync
 
 # when DIO is disabled via tunable, statx will not return the dioalign result
@@ -141,7 +142,7 @@ done
 # Now we extend the file into its second block. This effectively locks in its
 # block size, which will always be returned regardless of recordsize changes.
 log_must zfs set recordsize=128K $TESTDS
-log_must dd if=/dev/urandom of=$TESTFILE bs=192K count=1
+log_must stride_dd -i /dev/urandom -o $TESTFILE -b 196608 -c 1
 log_must zpool sync
 
 # Confirm that no matter how we change the recordsize, the alignment remains at
@@ -167,14 +168,14 @@ log_must rm -f $TESTFILE
 log_must touch $TESTFILE
 log_must zpool sync
 assert_dioalign $TESTFILE $PAGE_SIZE 16384
-log_must dd if=/dev/urandom of=$TESTFILE bs=16384 count=16 oflag=direct
+log_must stride_dd -i /dev/urandom -o $TESTFILE -b 16384 -c 16 -D
 
 # same again, but writing with incorrect alignment, which should fail.
 log_must rm -f $TESTFILE
 log_must touch $TESTFILE
 log_must zpool sync
 assert_dioalign $TESTFILE $PAGE_SIZE 16384
-log_mustnot dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
+log_mustnot stride_dd -i /dev/urandom -o $TESTFILE -b 1024 -c 256 -D
 
 # same again, but without strict, which should succeed.
 log_must set_tunable32 DIO_STRICT 0
@@ -182,6 +183,6 @@ log_must rm -f $TESTFILE
 log_must touch $TESTFILE
 log_must zpool sync
 assert_dioalign $TESTFILE $PAGE_SIZE 16384
-log_must dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
+log_must stride_dd -i /dev/urandom -o $TESTFILE -b 1024 -c 256 -D
 
 log_pass $CLAIM

From f9bf31ff7a55eb038d96f4adbfde0394d9bb1ae7 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 19 May 2026 18:54:56 -0700
Subject: [PATCH 058/129] ZTS: zfs_unshare_006_pos.ksh enable usershares

Ensure samba usershares are enabled in the CI test environment for
the zfs_unshare_006_pos test case.  By default they are disabled
in Ubuntu 26.04 LTS and must be enabled.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18547
---
 .github/workflows/scripts/qemu-3-deps-vm.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh
index 82cda8bf5bc..d61e97cf423 100755
--- a/.github/workflows/scripts/qemu-3-deps-vm.sh
+++ b/.github/workflows/scripts/qemu-3-deps-vm.sh
@@ -290,6 +290,9 @@ case "$1" in
     sudo -E systemctl enable nfs-kernel-server
     sudo -E systemctl enable smbd
 
+    # enable usershares (disabled by default on ubuntu 26.04)
+    sudo -E sed -i '/usershare max shares/s/^#//' /etc/samba/smb.conf
+
     # add systemd drop-in to allow the service to be enabled
     sudo -E mkdir -p /etc/systemd/system/qemu-guest-agent.service.d/
     sudo -E tee /etc/systemd/system/qemu-guest-agent.service.d/override.conf <<EOF

From b394b8742e6f11b19c220f872473bd57b6ef7358 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Tue, 19 May 2026 12:50:39 +1000
Subject: [PATCH 059/129] ZTS/zfs_mount: lift & update helpers from
 zfs_mount_remount

zfs_mount_remount has some nice helpers for checking the claimed and
actual read-only/read-write state of a mount. I wanted to use them for
another test but they weren't exactly what I wanted.

This adds separate functions for the different kinds of mounts the
zfs_mount_remount test wants to use, mostly to avoid the asymmetry of
sometimes calling a helper function and sometimes doing it direct. It
also separates the code to get the current ro/rw mount option from
actually asserting it.

Test has been updated to use the new functions, but the logic and
structure has not changed.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18557
Closes #18563
---
 .../cli_root/zfs_mount/zfs_mount.kshlib       | 128 ++++++++++++++++++
 .../cli_root/zfs_mount/zfs_mount_remount.ksh  | 103 ++++----------
 2 files changed, 156 insertions(+), 75 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
index 08795a7ea25..5d7ceb97112 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
@@ -27,6 +27,8 @@
 
 #
 # Copyright (c) 2017 by Delphix. All rights reserved.
+# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+# Copyright (c) 2026, TrueNAS.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -131,3 +133,129 @@ function verify_mount_display
 	done
 	return 0
 }
+
+# Helper functions to call the system mount(8) with various options
+function mount_default # <dataset mountpoint | mountpoint>
+{
+	# A default mount passes nothing beyond the filesystem type, and
+	# that option string is the same on every platform, so no
+	# per-platform branch is needed here.
+	typeset opts="-t zfs"
+
+	# "$@" carries either "dataset mountpoint" or just "mountpoint";
+	# the exit status of mount is handed back to the caller.
+	mount $opts "$@"
+	return $?
+}
+
+function mount_ro # <dataset mountpoint | mountpoint>
+{
+	# Read-only mount through the system mount(8): FreeBSD takes
+	# the -r flag, everywhere else the request is spelled "-o ro".
+	typeset opts="-t zfs -o ro"
+	if is_freebsd; then
+		opts="-t zfs -r"
+	fi
+
+	# the function's status is whatever mount exits with
+	mount $opts "$@"
+}
+
+function mount_rw # <dataset mountpoint | mountpoint>
+{
+	# Read-write mount through the system mount(8): FreeBSD takes
+	# the -w flag, everywhere else the request is spelled "-o rw".
+	typeset opts="-t zfs -o rw"
+	if is_freebsd; then
+		opts="-t zfs -w"
+	fi
+
+	# the function's status is whatever mount exits with
+	mount $opts "$@"
+}
+
+function remount_ro # <dataset mountpoint | mountpoint>
+{
+	# Flip an existing mount to read-only: FreeBSD updates it with
+	# -u plus -r, everywhere else it is "-o remount,ro".
+	typeset opts="-o remount,ro"
+	if is_freebsd; then
+		opts="-t zfs -ur"
+	fi
+
+	# the function's status is whatever mount exits with
+	mount $opts "$@"
+}
+
+function remount_rw # <dataset mountpoint | mountpoint>
+{
+	# Flip an existing mount to read-write: FreeBSD updates it with
+	# -u plus -w, everywhere else it is "-o remount,rw".
+	typeset opts="-o remount,rw"
+	if is_freebsd; then
+		opts="-t zfs -uw"
+	fi
+
+	# the function's status is whatever mount exits with
+	mount $opts "$@"
+}
+
+#
+# Verify that $mountpoint is mounted readonly by attempting to create a file.
+# This is preferred over "log_mustnot touch $fs" because we actually want to
+# verify the error returned is EROFS (exit status 30), not just any failure.
+#
+function mount_is_ro # mountpoint
+{
+	typeset mountpoint="$1"
+
+	file_write -o create -f "$mountpoint/file.dat"
+	typeset ret=$?	# keep the status local instead of leaking a global
+	if [[ $ret != 30 ]]; then
+		log_fail "Writing to $mountpoint did not return EROFS ($ret)."
+	fi
+}
+
+function mount_is_rw # mountpoint
+{
+	# a successful touch under the mountpoint shows it accepts writes
+	log_must touch "$1/file.dat"
+}
+
+# Print the read-only/read-write mount option in effect for $mountpoint.
+# Output is "ro" or "rw"; nothing is printed when $mountpoint has no entry
+# of type zfs in the mount table.
+function mount_get_ro_rw # mountpoint
+{
+	typeset mountpoint="$1"
+
+	# Rows from "mount -p" and /proc/mounts share one layout, e.g.:
+	# tank/hello  /tank/hello  zfs  rw,relatime,xattr  0 0
+	# so one awk program serves both: match on mountpoint ($2) and type
+	# ($3), then look for a standalone ro/rw in the option list ($4).
+	typeset filter='$2 == mountpoint && $3 == "zfs" {
+	    if ($4 ~ /(^|,)ro(,|$)/) print "ro"
+	    if ($4 ~ /(^|,)rw(,|$)/) print "rw"
+	}'
+
+	if is_freebsd; then
+		mount -p | awk -v mountpoint="$mountpoint" "$filter"
+	else
+		awk -v mountpoint="$mountpoint" "$filter" /proc/mounts
+	fi
+}
+
+# Verify that the mount table lists $mountpoint with the "ro" option
+function mount_has_ro_option # mountpoint
+{
+	typeset ropt=$(mount_get_ro_rw "$1")
+	log_must test "$ropt" == "ro"
+}
+
+# Verify that the mount table lists $mountpoint with the "rw" option
+function mount_has_rw_option # mountpoint
+{
+	typeset ropt=$(mount_get_ro_rw "$1")
+	log_must test "$ropt" == "rw"
+}
+
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh
index c54128f7b9e..a16d17a1229 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh
@@ -23,6 +23,7 @@
 
 #
 # Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+# Copyright (c) 2026, TrueNAS.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -54,54 +55,6 @@ function cleanup
 	return 0
 }
 
-if is_freebsd; then
-	typeset RO="-t zfs -ur"
-	typeset RW="-t zfs -uw"
-else
-	typeset RO="-o remount,ro"
-	typeset RW="-o remount,rw"
-fi
-
-#
-# Verify the $filesystem is mounted readonly
-# This is preferred over "log_mustnot touch $fs" because we actually want to
-# verify the error returned is EROFS
-#
-function readonlyfs # filesystem
-{
-	typeset filesystem="$1"
-
-	file_write -o create -f $filesystem/file.dat
-	ret=$?
-	if [[ $ret != 30 ]]; then
-		log_fail "Writing to $filesystem did not return EROFS ($ret)."
-	fi
-}
-
-#
-# Verify $dataset is mounted with $option
-#
-function checkmount # dataset option
-{
-	typeset dataset="$1"
-	typeset option="$2"
-	typeset options=""
-
-	if is_freebsd; then
-		options=$(mount -p | awk -v ds="$dataset" '$1 == ds { print $4 }')
-	else
-		options=$(awk -v ds="$dataset" '$1 == ds { print $4 }' /proc/mounts)
-	fi
-	if [[ "$options" == '' ]]; then
-		log_fail "Dataset $dataset is not mounted"
-	elif [[ ! -z "${options##*$option*}" ]]; then
-		log_fail "Dataset $dataset is not mounted with expected "\
-		    "option $option ($options)"
-	else
-		log_note "Dataset $dataset is mounted with option $option"
-	fi
-}
-
 log_assert "Verify remount functionality on both filesystem and snapshots"
 
 log_onexit cleanup
@@ -117,35 +70,35 @@ MNTPSNAP="$TESTDIR/zfs_snap_mount"
 log_must mkdir -p $MNTPSNAP
 
 # 2. Verify we can (re)mount the dataset readonly/read-write
-log_must touch $MNTPFS/file.dat
-checkmount $TESTFS 'rw'
-log_must mount $RO $TESTFS $MNTPFS
-readonlyfs $MNTPFS
-checkmount $TESTFS 'ro'
-log_must mount $RW $TESTFS $MNTPFS
-log_must touch $MNTPFS/file.dat
-checkmount $TESTFS 'rw'
+mount_is_rw $MNTPFS
+mount_has_rw_option $MNTPFS
+log_must remount_ro $TESTFS $MNTPFS
+mount_is_ro $MNTPFS
+mount_has_ro_option $MNTPFS
+log_must remount_rw $TESTFS $MNTPFS
+mount_is_rw $MNTPFS
+mount_has_rw_option $MNTPFS
 
 if is_linux; then
 	# 3. Verify we can (re)mount the snapshot readonly
-	log_must mount -t zfs $TESTSNAP $MNTPSNAP
-	readonlyfs $MNTPSNAP
-	checkmount $TESTSNAP 'ro'
-	log_must mount $RO $TESTSNAP $MNTPSNAP
-	readonlyfs $MNTPSNAP
-	checkmount $TESTSNAP 'ro'
+	log_must mount_default $TESTSNAP $MNTPSNAP
+	mount_is_ro $MNTPSNAP
+	mount_has_ro_option $MNTPSNAP
+	log_must remount_ro $TESTSNAP $MNTPSNAP
+	mount_is_ro $MNTPSNAP
+	mount_has_ro_option $MNTPSNAP
 	log_must umount $MNTPSNAP
 fi
 
 # 4. Verify we can't remount a snapshot read-write
 # The "mount -o rw" command will succeed but the snapshot is mounted readonly.
 # The "mount -o remount,rw" command must fail with an explicit error.
-log_must mount -t zfs -o rw $TESTSNAP $MNTPSNAP
-readonlyfs $MNTPSNAP
-checkmount $TESTSNAP 'ro'
-log_mustnot mount $RW $TESTSNAP $MNTPSNAP
-readonlyfs $MNTPSNAP
-checkmount $TESTSNAP 'ro'
+log_must mount_rw $TESTSNAP $MNTPSNAP
+mount_is_ro $MNTPSNAP
+mount_has_ro_option $MNTPSNAP
+log_mustnot remount_rw $TESTSNAP $MNTPSNAP
+mount_is_ro $MNTPSNAP
+mount_has_ro_option $MNTPSNAP
 log_must umount $MNTPSNAP
 
 # 5. Verify we can remount a dataset readonly and unmount it with
@@ -153,8 +106,8 @@ log_must umount $MNTPSNAP
 log_must eval "echo 'password' | zfs create -o sync=disabled \
     -o encryption=on -o keyformat=passphrase $TESTFS/crypt"
 CRYPT_MNTPFS="$(get_prop mountpoint $TESTFS/crypt)"
-log_must touch $CRYPT_MNTPFS/file.dat
-log_must mount $RO $TESTFS/crypt $CRYPT_MNTPFS
+mount_is_rw $CRYPT_MNTPFS
+log_must remount_ro $TESTFS/crypt $CRYPT_MNTPFS
 log_must umount -f $CRYPT_MNTPFS
 sync_pool $TESTPOOL
 
@@ -163,10 +116,10 @@ log_must zpool export $TESTPOOL
 log_must zpool import -o readonly=on $TESTPOOL
 
 # 7. Verify we can't remount its filesystem read-write
-readonlyfs $MNTPFS
-checkmount $TESTFS 'ro'
-log_mustnot mount $RW $MNTPFS
-readonlyfs $MNTPFS
-checkmount $TESTFS 'ro'
+mount_is_ro $MNTPFS
+mount_has_ro_option $MNTPFS
+log_mustnot remount_rw $MNTPFS
+mount_is_ro $MNTPFS
+mount_has_ro_option $MNTPFS
 
 log_pass "Both filesystem and snapshots can be remounted correctly."

From 20437d856cf0dcf8d562b7b32fc8858fafb456e7 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Tue, 19 May 2026 14:27:12 +1000
Subject: [PATCH 060/129] ZTS/zfs_mount: test that ro/rw mount methods remain
 consistent

Whether a mount ends up as read-only or read-write depends on a
combination of platform, readonly= filesystem property, mount method
(system mount(8) or zfs-mount(8)) and mount option provided (ro, rw or
none).

This tests all combinations, and ensures they match what has
traditionally been expected on this platform, so we'll know if we
accidentally changed it.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18557
Closes #18563
---
 tests/runfiles/common.run                     |   4 +-
 tests/runfiles/sanity.run                     |   2 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../cli_root/zfs_mount/zfs_mount_ro_rw.ksh    | 130 ++++++++++++++++++
 4 files changed, 134 insertions(+), 3 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh

diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 82a2d1e815e..6e62b552a0d 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -271,8 +271,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos',
     'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
     'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted',
-    'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
-    'zfs_mount_test_race', 'zfs_mount_recursive']
+    'zfs_mount_remount', 'zfs_mount_ro_rw', 'zfs_mount_all_fail',
+    'zfs_mount_all_mountpoints', 'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run
index d62f8f3fb16..0deaa038a31 100644
--- a/tests/runfiles/sanity.run
+++ b/tests/runfiles/sanity.run
@@ -156,7 +156,7 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos',
     'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
     'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount',
-    'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
+    'zfs_mount_ro_rw', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
     'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 85f00f28b0f..98f39253882 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -814,6 +814,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_remount.ksh \
+	functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \
 	functional/cli_root/zfs_mount/zfs_multi_mount.ksh \
 	functional/cli_root/zfs_program/cleanup.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh
new file mode 100755
index 00000000000..15e78e6fd88
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh
@@ -0,0 +1,130 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026, TrueNAS.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
+
+#
+# we set up and mount multiple times, with these combinations:
+# - readonly property: on, off
+# - mount method: mount(8) (mountpoint=legacy), zfs-mount(8) (mountpoint=path)
+# - mount option: [none], ro, rw
+#
+# after each mount, we check whether we ended up mounting read-only or
+# read-write, and note the result. once we've done them all, we compare the
+# result set to the "correct" set for this platform (by observation). the
+# test passes if they match, and fails if they don't
+#
+#        readonly     |         on          |         off         |
+#        mount method |  legacy  |   path   |  legacy  |   path   |
+#        mount option | -- ro rw | -- ro rw | -- ro rw | -- ro rw |
+typeset -a rs_linux=(   rw ro rw   ro ro rw   rw ro rw   rw ro rw )
+typeset -a rs_freebsd=( ro ro ro   ro ro rw   rw ro rw   rw ro rw )
+
+if is_linux ; then
+    typeset -n rs_wanted=rs_linux
+elif is_freebsd ; then
+    typeset -n rs_wanted=rs_freebsd
+else
+    log_unsupported "no result set defined for this platform"
+fi
+
+verify_runnable "both"
+
+testfs=$TESTPOOL/$TESTFS
+testmnt=$TESTDIR/mountpoint
+
+function cleanup
+{
+	log_must zfs inherit -S canmount $testfs
+	log_must zfs inherit readonly $testfs
+	log_must zfs inherit mountpoint $testfs
+	log_must rm -rf $testmnt
+}
+
+log_assert "Verify combinations of readonly/readwrite produce correct mount."
+
+log_onexit cleanup
+
+
+# setup
+log_must datasetexists $testfs
+log_must zfs set canmount=noauto $testfs
+umount $testfs
+
+
+typeset -a rs=()
+
+for readonly in on off ; do
+	for method in legacy path ; do
+		for option in default ro rw ; do
+
+			log_must zfs set readonly=$readonly $testfs
+
+			if [[ $method == 'legacy' ]] ; then
+				log_must zfs set mountpoint=legacy $testfs
+			else
+				log_must zfs set mountpoint=$testmnt $testfs
+			fi
+
+			# recreate the mountpoint. even if it wasn't mounted,
+			# changing the mountpoint property can remove it
+			log_must mkdir -p $testmnt
+
+			# issue the mount with the wanted method and option
+			case $method in
+			legacy)
+				case $option in
+				default) log_must mount_default $testfs $testmnt ;;
+				ro)      log_must mount_ro $testfs $testmnt ;;
+				rw)      log_must mount_rw $testfs $testmnt ;;
+				esac
+			;;
+			path)
+				case $option in
+				default)  log_must zfs mount $testfs ;;
+				ro)       log_must zfs mount -o ro $testfs ;;
+				rw)       log_must zfs mount -o rw $testfs ;;
+				esac
+			;;
+			esac
+
+			result=$(mount_get_ro_rw $testmnt)
+			rs+=($result)
+			log_note "result: $result"
+
+			log_must umount $testfs
+		done
+	done
+done
+
+log_note "results: ${rs[@]}"
+log_note "wanted:  ${rs_wanted[@]}"
+
+log_must test "${rs[*]}" == "${rs_wanted[*]}"
+
+log_pass "All mounts correct for this platform."

From 58d719442682ade57c85e7bfa34a354f0698bfdd Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 20 May 2026 09:42:19 +1000
Subject: [PATCH 061/129] linux/super: properly apply ro/rw mount option to
 superblock

f5a9e3a622 changed how SB_RDONLY was applied to the new mount in a way
that was too simplistic - it only sets readonly on the filesystem if the
mount was 'ro', but it never clears it if the mount was 'rw'. This
causes the 'rw' option to effectively be ignored, and so the readonly=
property wins out.

This fixes it by doing it the right way: checking the flags mask to see
if it was actually provided as an option at all, and then setting or
clearing it as appropriate.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18557
Closes #18563
---
 module/os/linux/zfs/zpl_super.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index c1460edd16d..e1fa0f8e88e 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -883,9 +883,14 @@ zpl_get_tree(struct fs_context *fc)
 	if (sb->s_root == NULL) {
 		vfs_t *vfs = fc->fs_private;
 
-		/* Apply readonly flag as mount option */
-		if (fc->sb_flags & SB_RDONLY) {
-			vfs->vfs_readonly = B_TRUE;
+		/*
+		 * If SB_RDONLY was set/cleared from mount options, update
+		 * them in the options struct so we set up the filesystem
+		 * in the proper state.
+		 */
+		if (fc->sb_flags_mask & SB_RDONLY) {
+			vfs->vfs_readonly =
+			    (fc->sb_flags & SB_RDONLY) ? B_TRUE : B_FALSE;
 			vfs->vfs_do_readonly = B_TRUE;
 		}
 

From 15761954d74baf51c1d03f816087e6d4037eed15 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Thu, 21 May 2026 09:23:32 -0700
Subject: [PATCH 062/129] CI: Build custom branch from zfs-qemu-packages

The zfs-qemu-packages workflow allows us to easily build RPMs for the
current branch.  However, there can be cases where we want to use the
current CI environment to build older releases.  This can happen when
the VM or runner environment changes, and the older CI doesn't have
the updates needed to run with it anymore.

This commit adds in a text box to specify a specific branch/tag to build
using the current CI environment.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18569
---
 .github/workflows/scripts/qemu-4-build-vm.sh | 32 ++++++++++++++++++--
 .github/workflows/zfs-qemu-packages.yml      | 12 ++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/scripts/qemu-4-build-vm.sh b/.github/workflows/scripts/qemu-4-build-vm.sh
index bd77f2c5ca1..dfe70c4f1ef 100755
--- a/.github/workflows/scripts/qemu-4-build-vm.sh
+++ b/.github/workflows/scripts/qemu-4-build-vm.sh
@@ -5,10 +5,12 @@
 #
 # Usage:
 #
-#       qemu-4-build-vm.sh OS [--enable-debug][--dkms][--patch-level NUM]
-#               [--poweroff][--release][--repo][--tarball]
+#       qemu-4-build-vm.sh OS [--custom-branch BRANCH][--enable-debug][--dkms]
+#               [--patch-level NUM][--poweroff][--release][--repo][--tarball]
 #
 # OS:           OS name like 'fedora41'
+# --custom-branch: When building packages, checkout this version of ZFS to
+#                  build, but use the current CI scripts to do it.
 # --enable-debug:  Build RPMs with '--enable-debug' (for testing)
 # --dkms:       Build DKMS RPMs as well
 # --patch-level NUM:    Use a custom patch level number for packages.
@@ -27,8 +29,27 @@ POWEROFF=""
 RELEASE=""
 REPO=""
 TARBALL=""
+CUSTOM_BRANCH=""
+PREV_BRANCH=""
+
+cleanup() {
+  if [ -n "$PREV_BRANCH" ] ; then
+    git checkout $PREV_BRANCH
+  fi
+}
+
 while [[ $# -gt 0 ]]; do
   case $1 in
+    --custom-branch)
+      CUSTOM_BRANCH="$2"
+      # If the user specifies a custom tag/branch to build, and the build
+      # fails, we want to make sure our workflow scripts are restored to the
+      # current (more modern) versions so the subsequent CI steps use those.
+      shift
+      shift
+      PREV_BRANCH=$(git branch --show-current)
+      trap 'cleanup' ERR
+      ;;
     --enable-debug)
       ENABLE_DEBUG=1
       shift
@@ -367,6 +388,11 @@ if [ -n "$ENABLE_DEBUG" ] ; then
   extra="--enable-debug"
 fi
 
+if [ -n "$CUSTOM_BRANCH" ] ; then
+  git fetch --unshallow
+  git checkout $CUSTOM_BRANCH
+fi
+
 # build
 case "$OS" in
   freebsd*)
@@ -393,6 +419,8 @@ case "$OS" in
     ;;
 esac
 
+git checkout $PREV_BRANCH
+PREV_BRANCH=""
 
 # building the zfs module was ok
 echo 0 > /var/tmp/build-exitcode.txt
diff --git a/.github/workflows/zfs-qemu-packages.yml b/.github/workflows/zfs-qemu-packages.yml
index 25afb77233c..e3333086e62 100644
--- a/.github/workflows/zfs-qemu-packages.yml
+++ b/.github/workflows/zfs-qemu-packages.yml
@@ -42,6 +42,11 @@ on:
         required: false
         default: ""
         description: "(optional) repo URL (blank: use http://download.zfsonlinux.org)"
+      custom_branch:
+        type: string
+        required: false
+        default: ""
+        description: "(optional) custom tag/branch to build using current CI (like 'zfs-2.2.9')"
       lookup:
         type: boolean
         required: false
@@ -94,9 +99,16 @@ jobs:
                 if [ -n "${{ github.event.inputs.patch_level }}" ] ; then
                         EXTRA="--patch-level ${{ github.event.inputs.patch_level }}"
                 fi
+                if [ -n "${{ github.event.inputs.custom_branch }}" ] ; then
+                        EXTRA+=" --custom-branch ${{ github.event.inputs.custom_branch }}"
+                fi
 
                 .github/workflows/scripts/qemu-4-build.sh $EXTRA \
                         --repo --release --dkms --tarball ${{ matrix.os }}
+
+                if [ -n "${{ github.event.inputs.custom_branch }}" ] ; then
+                        echo "Built packages for ${{ github.event.inputs.custom_branch }}"
+                fi
         fi
 
     - name: Prepare artifacts

From 971791762a7eedd543f753ca06cca6db2f600d12 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Fri, 22 May 2026 18:18:39 +0200
Subject: [PATCH 063/129] CI: enable FreeBSD 15.0-RELEASE in matrix

Add freebsd15-0r to the FreeBSD presets

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18561
---
 .github/workflows/scripts/qemu-2-start.sh | 2 +-
 .github/workflows/zfs-qemu.yml            | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh
index 3eba651ae6d..9770c8903cc 100755
--- a/.github/workflows/scripts/qemu-2-start.sh
+++ b/.github/workflows/scripts/qemu-2-start.sh
@@ -98,7 +98,7 @@ case "$OS" in
   freebsd15-0r)
     FreeBSD="15.0-RELEASE"
     OSNAME="FreeBSD $FreeBSD"
-    OSv="freebsd15.0"
+    OSv="freebsd14.0"
     URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
     KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
     ;;
diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index 500e54ad84f..9f251beac9e 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -55,11 +55,11 @@ jobs:
             os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora43", "fedora44", "ubuntu22", "ubuntu24", "ubuntu26"]'
             ;;
           freebsd)
-            os_selection='["freebsd14-4r", "freebsd14-4s", "freebsd15-1s", "freebsd16-0c"]'
+            os_selection='["freebsd14-4r", "freebsd14-4s", "freebsd15-0r", "freebsd15-1s", "freebsd16-0c"]'
             ;;
           *)
             # default list
-            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24", "ubuntu26"]'
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-0r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24", "ubuntu26"]'
             ;;
           esac
 

From 1916c2c55280f5a01f16e901f213d686eb0af93b Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 22 May 2026 12:01:22 -0700
Subject: [PATCH 064/129] CI: skip full CI runs on push events

Full CI runs for proposed changes always occur in the PR where the
review is done and patch approved.  Once merged the full CI is run
again using the merged commit.  This is somewhat overkill.  In the
interest of reducing the CI load only run the zloop and checkstyle
workflows which are enough to verify the build on the master branch.
Push events to forks will continue to trigger a full CI run.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18571
---
 .github/workflows/codeql.yml   |  1 +
 .github/workflows/smatch.yml   |  1 +
 .github/workflows/zfs-arm.yml  |  1 +
 .github/workflows/zfs-qemu.yml | 11 +++++++++--
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 689fe71fddc..04ad7fae711 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -11,6 +11,7 @@ concurrency:
 jobs:
   analyze:
     name: Analyze
+    if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs'
     runs-on: ubuntu-22.04
     permissions:
       actions: read
diff --git a/.github/workflows/smatch.yml b/.github/workflows/smatch.yml
index 305a1f0179b..ffad83b64ea 100644
--- a/.github/workflows/smatch.yml
+++ b/.github/workflows/smatch.yml
@@ -10,6 +10,7 @@ concurrency:
 
 jobs:
   smatch:
+    if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs'
     runs-on: ubuntu-24.04
     steps:
     - name: Checkout smatch
diff --git a/.github/workflows/zfs-arm.yml b/.github/workflows/zfs-arm.yml
index b6d6444c2dd..fb91f198f6d 100644
--- a/.github/workflows/zfs-arm.yml
+++ b/.github/workflows/zfs-arm.yml
@@ -14,6 +14,7 @@ on:
 jobs:
   zfs-arm:
     name: ZFS ARM build
+    if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs'
     runs-on: ubuntu-24.04-arm
     steps:
     - uses: actions/checkout@v6
diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index 9f251beac9e..64ffee484a5 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -23,6 +23,7 @@ concurrency:
 jobs:
   test-config:
     name: Setup
+    if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs'
     runs-on: ubuntu-24.04
     outputs:
       test_os: ${{ steps.os.outputs.os }}
@@ -94,7 +95,10 @@ jobs:
   qemu-vm:
     name: qemu-x86
     needs: [ test-config ]
-    if: needs.test-config.outputs.ci_type != 'docs'
+    if: >-
+      (github.event_name == 'pull_request' ||
+      github.repository != 'openzfs/zfs') &&
+      needs.test-config.outputs.ci_type != 'docs'
     strategy:
       fail-fast: false
       matrix:
@@ -157,7 +161,10 @@ jobs:
       run: .github/workflows/scripts/qemu-8-summary.sh '${{ steps.artifact-upload.outputs.artifact-url }}'
 
   cleanup:
-    if: always()
+    if: >-
+      (github.event_name == 'pull_request' ||
+      github.repository != 'openzfs/zfs') &&
+      always()
     name: Cleanup
     runs-on: ubuntu-latest
     needs: [ qemu-vm ]

From accb2b418e983ced71b136efe1eb4d57821a2a80 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Fri, 22 May 2026 22:15:31 +0200
Subject: [PATCH 065/129] CI: run full CI when a workflow YAML changes

FULL_RUN_REGEX in generate-ci-type.py covered .github/workflows/scripts/
but not the workflow YAML files, so a PR that only edited zfs-qemu.yml
got "quick" CI and never tested its own matrix change. Add the YAML
files to the list.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18577
---
 .github/workflows/scripts/generate-ci-type.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/scripts/generate-ci-type.py b/.github/workflows/scripts/generate-ci-type.py
index 08f0c0fcc9a..4862cc16139 100755
--- a/.github/workflows/scripts/generate-ci-type.py
+++ b/.github/workflows/scripts/generate-ci-type.py
@@ -48,6 +48,7 @@
 Patterns of files that are considered to trigger full CI.
 """
 FULL_RUN_REGEX = list(map(re.compile, [
+    r'\.github/workflows/.*\.ya?ml',
     r'\.github/workflows/scripts/.*',
     r'cmd.*',
     r'configs/.*',

From 82b33c00347d8cce836b8f36be10035588c5ba23 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sat, 2 May 2026 17:37:12 +1000
Subject: [PATCH 066/129] unit: a unit testing framework

This commit establishes a unit test framework for OpenZFS, and
integrates it into the build.

It includes:
- the "munit" unit test framework (munit.c, munit.h)
- some light extensions to munit and glue for OpenZFS (unit.c, unit.h)
- make targets for running tests and generating coverage reports
- a document explaining the what, how and why

This is a first step; I expect we will extend all of this as we use it
more places and gain experience with it.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18564
---
 Makefile.am            |    1 +
 tests/Makefile.am      |    1 +
 tests/unit/.gitignore  |    2 +
 tests/unit/Makefile.am |   60 +
 tests/unit/README.md   |  189 +++
 tests/unit/munit.c     | 2458 ++++++++++++++++++++++++++++++++++++++++
 tests/unit/munit.h     |  575 ++++++++++
 tests/unit/unit.c      |   85 ++
 tests/unit/unit.h      |   60 +
 9 files changed, 3431 insertions(+)
 create mode 100644 tests/unit/.gitignore
 create mode 100644 tests/unit/Makefile.am
 create mode 100644 tests/unit/README.md
 create mode 100644 tests/unit/munit.c
 create mode 100644 tests/unit/munit.h
 create mode 100644 tests/unit/unit.c
 create mode 100644 tests/unit/unit.h

diff --git a/Makefile.am b/Makefile.am
index 73382f86e6f..c1638aa4288 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -138,6 +138,7 @@ cstyle:
 		! -path './include/sys/lua/*' \
 		! -path './module/lua/l*.[ch]' \
 		! -path './module/zfs/lz4.c' \
+		! -path './tests/unit/munit.[ch]' \
 		$(cstyle_line)
 
 filter_executable = -exec test -x '{}' \; -print
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b007a3d7e5f..2002ced658c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: CDDL-1.0
+include $(srcdir)/%D%/unit/Makefile.am
 include $(srcdir)/%D%/zfs-tests/Makefile.am
 
 
diff --git a/tests/unit/.gitignore b/tests/unit/.gitignore
new file mode 100644
index 00000000000..52315f0b5fa
--- /dev/null
+++ b/tests/unit/.gitignore
@@ -0,0 +1,2 @@
+/test_*.info
+/test_*_coverage
diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am
new file mode 100644
index 00000000000..1ba1258143e
--- /dev/null
+++ b/tests/unit/Makefile.am
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: CDDL-1.0
+
+# libunit.la includes munit and any additional tools that apply to all tests
+libunit_la_CFLAGS = $(AM_CFLAGS)
+
+noinst_LTLIBRARIES += libunit.la
+libunit_la_SOURCES = \
+	%D%/munit.c \
+	%D%/munit.h \
+	%D%/unit.c \
+	%D%/unit.h
+
+
+# all test binaries
+UNIT_TESTS =
+noinst_PROGRAMS = $(UNIT_TESTS)
+
+
+# test run and coverage targets below
+PHONY += unit unit-coverage
+
+_unit_run_%: %D%/%
+	@echo "  UNITTEST $<" ; $<
+
+_unit_coverage_%: _unit_run_%
+	@${LCOV} --quiet --capture  \
+		--test-name $(subst _unit_coverage_, , $@) \
+		--directory $(top_srcdir) \
+		--output-file \
+			%D%/$(join $(subst _unit_coverage_, , $@), .info) \
+		$(addprefix --include , $(call $(join \
+			$(subst _unit_coverage_, nodist_%C%_, $@), _SOURCES)))
+	@${GENHTML} --quiet \
+		%D%/$(join $(subst _unit_coverage_, , $@), .info) \
+		--output-directory \
+			%D%/$(join $(subst _unit_coverage_, , $@), _coverage)
+	@echo "coverage results:" \
+		"file://$(realpath %D%)/$(join $(subst _unit_coverage_, , $@), _coverage)/index.html"
+
+_UNIT_ALL_TARGETS = $(notdir $(UNIT_TESTS))
+_UNIT_FIND_TARGET = \
+	$(foreach cmd, $(UNIT_TESTS), \
+		$(if $(filter $(join test_, $(1)), $(notdir $(cmd))), \
+			$(notdir $(cmd))))
+
+_UNIT_TARGETS = $(if $(T), \
+	$(call _UNIT_FIND_TARGET, $(T)), $(call _UNIT_ALL_TARGETS))
+
+unit: $(addprefix _unit_run_, $(_UNIT_TARGETS))
+	@$(if $^, true, echo "ERROR: couldn't find unit test: $(T)" && false)
+
+if CODE_COVERAGE_ENABLED
+unit-coverage: $(addprefix _unit_coverage_, $(_UNIT_TARGETS))
+	@$(if $^, true, echo "ERROR: couldn't find unit test: $(T)" && false)
+else
+unit-coverage:
+	@echo "unit test coverage not enabled."
+	@echo "re-run configure with --enable-code-coverage"
+	@false
+endif
diff --git a/tests/unit/README.md b/tests/unit/README.md
new file mode 100644
index 00000000000..464c8f1c731
--- /dev/null
+++ b/tests/unit/README.md
@@ -0,0 +1,189 @@
+# Unit tests
+
+> [!NOTE]
+>
+> This document is a draft. It will be updated as we gain experience writing
+> and running unit tests.
+
+This directory contains a unit testing framework for OpenZFS, and a collection
+of unit tests.
+
+## Building and running
+
+The unit tests are built by default as part of the regular userspace build, so
+you probably don’t have to do anything else.
+
+The easiest way to run the tests is to run `make unit`, which will run all the
+available tests.
+
+```
+$ make unit
+  UNITTEST tests/unit/test_zap
+Running test suite with seed 0x9d36890b...
+zap.mock_microzap_sanity             [ OK    ] [ 0.00001088 / 0.00000939 CPU ]
+zap.mock_fatzap_sanity               [ OK    ] [ 0.00004281 / 0.00004257 CPU ]
+zap.zap_basic
+  type=micro                         [ OK    ] [ 0.00001899 / 0.00001893 CPU ]
+  type=fat                           [ OK    ] [ 0.00004174 / 0.00004135 CPU ]
+4 of 4 (100%) tests successful, 0 (0%) test skipped.
+```
+
+Running a single test binary is possible with the `T=` param to `make unit`.
+
+```
+$ make unit T=zap
+  UNITTEST tests/unit/test_zap
+  ...
+```
+
+The test binaries are just normal programs in `./tests/unit`, and can be run
+directly. This is useful for debugging with `gdb`.
+
+```
+$ ./tests/unit/test_zap
+Running test suite with seed 0x18e131ac...
+...
+```
+
+### Building just for tests
+
+Recommended “minimum” build for just the unit tests, with additional debug to
+assist with understanding issues.
+
+```
+./configure \
+	--with-config=user \
+	--enable-debug --enable-debuginfo \
+	--disable-sysvinit --disable-systemd --disable-pam --disable-pyzfs
+make -j$(nproc)
+```
+
+TODO: add `--with-config=unit` that disables _everything_ not needed for the tests
+
+### Generating a coverage report
+
+If `configure` was run with `--enable-code-coverage`, then an additional
+`unit-coverage` target is available, which will run the requested tests, then
+run `lcov` and `genhtml` to produce an HTML coverage report:
+
+```
+$ make unit-coverage T=zap
+  UNITTEST tests/unit/test_zap
+Running test suite with seed 0xe461208d...
+zap.mock_microzap_sanity             [ OK    ] [ 0.00000933 / 0.00000773 CPU ]
+zap.mock_fatzap_sanity               [ OK    ] [ 0.00004685 / 0.00004612 CPU ]
+zap.zap_basic
+  type=micro                         [ OK    ] [ 0.00002579 / 0.00002484 CPU ]
+  type=fat                           [ OK    ] [ 0.00004093 / 0.00004038 CPU ]
+4 of 4 (100%) tests successful, 0 (0%) test skipped.
+lcov: WARNING: (inconsistent) /home/robn/code/zfs-unit/module/zfs/u8_textprep.c:1104: unexecuted block on non-branch line with non-zero hit count.  Use "geninfo --rc geninfo_unexecuted_blocks=1 to set count to zero.
+	(use "lcov --ignore-errors inconsistent,inconsistent ..." to suppress this warning)
+Message summary:
+  1 warning message:
+    inconsistent: 1
+Overall coverage rate:
+  source files: 6
+  lines.......: 42.3% (1270 of 3002 lines)
+  functions...: 42.0% (76 of 181 functions)
+Message summary:
+  no messages were reported
+coverage results: file://tests/unit/test_zap_coverage/index.html
+```
+
+TODO: improve the overall structure to make this less noisy.
+
+## Guidance for test writers
+
+### Top five
+
+* Only bring in the source files under test.
+* Use mocks to create the test scenario, then interrogate them to understand the result.
+* Prefer more smaller tests over fewer bigger ones.
+* Use coverage reports to guide test development.
+* Do the simplest possible thing.
+
+### Test structure
+
+Tests should be as simple and as readable as possible. When a test fails, we
+want to avoid the possibility that it could be the test itself at fault rather
+than the system under test.
+
+* Aim for one source file per subsystem or source concept (eg ZAP).
+* Aim for one test function per API call or logical behaviour
+  * Each “version” or “mode” of an API call or behaviour is a separate test
+  * Don’t test more than one thing in the same test; a test shouldn’t rely on
+    state or results from an earlier test
+* Use test parameters for “class” or “vtable”-type APIs, where each
+  implementation should respond to API calls the same way
+
+### Build system
+
+The build setup in `tests/unit/Makefile.am` is very similar to the rest of the
+userspace build; however, it has a couple of differences to make the run and
+coverage targets work more smoothly.
+
+* Name the test program `test_foo`. Almost always, you will have one source
+  file with the actual tests in it, called `test_foo.c`.
+* Add the program to `UNIT_TESTS`. `noinst_PROGRAMS` will be populated from it,
+  but this gives a specific name the run and coverage targets can use to
+  resolve the `T=` parameter to a specific test.
+* List the source files under test in `nodist_%C%_test_foo_SOURCES`, and the
+  source files for the test itself in `%C%_test_foo_SOURCES`. This is
+  important, as the coverage targets use `nodist_%C%_ ... _SOURCES` as the list
+  of objects to include in the coverage output.
+
+### Mocks
+
+A “mock” struct is a fake version of some data structure that the subsystem
+under test will accept and use as though it was a real one.
+
+* Make mock structs opaque. All uses from the test suite should be through
+  specific named accessor functions.
+* Name a mock struct for the struct it is mimicking, prefixed with `mock_`. eg
+  `mock_dnode_t` is the mock for `dnode_t`.
+* Access functions should be named for the struct, eg the function to create a
+  `mock_dnode_t` is `mock_dnode_t *mock_dnode_create(...)`.
+* `mock_*` functions should always use the mock type name in their signatures,
+  never the original.
+* The mock object should always be directly castable to its real type and
+  vice-versa, ie a `mock_dnode_t *` is always usable wherever a `dnode_t *`
+  is (within the domain of the subsystem under test).
+
+This guidance pushes the programmer towards being explicit at the possible
+expense of concision. This is in service of keeping the tests reliable; in
+particular, if mocks require explicit casting to use, then there’s far less
+chance of either a mock or a real object being used incorrectly in the test,
+which can be confusing.
+
+### Unit testing framework
+
+[µnit](https://nemequ.github.io/munit/) (aka munit) is the unit test framework.
+It is a relatively niche choice, and arguably abandoned by upstream, but is
+well constructed with a thoughtful feature set and some useful properties:
+
+* Just two source files we can easily carry in the repo.
+* Portable, including to Windows.
+* Each test is run in a forked process, so a test failure will not corrupt the
+  rest of the test suite run
+* Parameterised tests.
+* A large suite of assertions and other useful functions that make it easy to
+  integrate with.
+
+All OpenZFS unit tests are ultimately targeting munit, so it’s expected that
+they will use various features as needed. However, we also supply our own
+facilities to extend those in useful ways.
+
+#### Local extensions
+
+`unit.h` provides a handful of macros. The majority of these are aliases for
+the much longer munit names for the same function, eg `unit_true(n)` is an
+alias for `munit_assert_true(n)`, `unit_eq(a,b)` is an alias for
+`munit_assert_uint64(a, ==, b)`, and so on. These are there so that the
+assertions do not dominate the test visually, as we want it to be easier to
+focus on the details.
+
+Similarly, the `UNIT_TEST` and `UNIT_PARAM` macros exist to help with test
+definition, as the casts are a little complicated.
+
+The goal is to keep this set relatively small, but all of munit is there for
+use, so do extend it if necessary.
diff --git a/tests/unit/munit.c b/tests/unit/munit.c
new file mode 100644
index 00000000000..73d32728e8c
--- /dev/null
+++ b/tests/unit/munit.c
@@ -0,0 +1,2458 @@
+// SPDX-License-Identifier: MIT
+/* µnit Testing Framework
+ * Copyright (c) 2013-2018 Evan Nemerson <evan@nemerson.com>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*** Configuration ***/
+
+/* This is just where the output from the test goes.  It's really just
+ * meant to let you choose stdout or stderr, but if anyone really wants
+ * to direct it to a file let me know, it would be fairly easy to
+ * support. */
+#if !defined(MUNIT_OUTPUT_FILE)
+#  define MUNIT_OUTPUT_FILE stdout
+#endif
+
+/* This is a bit more useful; it tells µnit how to format the seconds in
+ * timed tests.  If your tests run for longer you might want to reduce
+ * it, and if your computer is really fast and your tests are tiny you
+ * can increase it. */
+#if !defined(MUNIT_TEST_TIME_FORMAT)
+#  define MUNIT_TEST_TIME_FORMAT "0.8f"
+#endif
+
+/* If you have long test names you might want to consider bumping
+ * this.  The result information takes 43 characters. */
+#if !defined(MUNIT_TEST_NAME_LEN)
+#  define MUNIT_TEST_NAME_LEN 37
+#endif
+
+/* If you don't like the timing information, you can disable it by
+ * defining MUNIT_DISABLE_TIMING. */
+#if !defined(MUNIT_DISABLE_TIMING)
+#  define MUNIT_ENABLE_TIMING
+#endif
+
+/* OpenZFS: claim no strerror_r, causing munit to use its own internal
+ * fallback. There are two versions of strerror_r (XSI and GNU), subtly
+ * different, and some glibc versions have warn_unused_result set on the
+ * prototype. munit is not prepared for this variance, so better just to
+ * let it do its own thing. -- robn, 2026-05-21 */
+#if !defined(MUNIT_NO_STRERROR_R)
+#  define MUNIT_NO_STRERROR_R
+#endif
+
+/*** End configuration ***/
+
+#if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L)
+#  undef _POSIX_C_SOURCE
+#endif
+#if !defined(_POSIX_C_SOURCE)
+#  define _POSIX_C_SOURCE 200809L
+#endif
+
+/* Solaris freaks out if you try to use a POSIX or SUS standard without
+ * the "right" C standard. */
+#if defined(_XOPEN_SOURCE)
+#  undef _XOPEN_SOURCE
+#endif
+
+#if defined(__STDC_VERSION__)
+#  if __STDC_VERSION__ >= 201112L
+#    define _XOPEN_SOURCE 700
+#  elif __STDC_VERSION__ >= 199901L
+#    define _XOPEN_SOURCE 600
+#  endif
+#endif
+
+/* Because, according to Microsoft, POSIX is deprecated.  You've got
+ * to appreciate the chutzpah. */
+#if defined(_MSC_VER) && !defined(_CRT_NONSTDC_NO_DEPRECATE)
+#  define _CRT_NONSTDC_NO_DEPRECATE
+#endif
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+#  include <stdbool.h>
+#elif defined(_WIN32)
+/* https://msdn.microsoft.com/en-us/library/tf4dy80a.aspx */
+#endif
+
+#include <limits.h>
+#include <time.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <setjmp.h>
+
+#if !defined(MUNIT_NO_NL_LANGINFO) && !defined(_WIN32)
+#  define MUNIT_NL_LANGINFO
+#  include <locale.h>
+#  include <langinfo.h>
+#  include <strings.h>
+#endif
+
+#if !defined(_WIN32)
+#  include <unistd.h>
+#  include <sys/types.h>
+#  include <sys/wait.h>
+#else
+#  include <windows.h>
+#  include <io.h>
+#  include <fcntl.h>
+#  if !defined(STDERR_FILENO)
+#    define STDERR_FILENO _fileno(stderr)
+#  endif
+#endif
+
+#include "munit.h"
+
+#define MUNIT_STRINGIFY(x) #x
+#define MUNIT_XSTRINGIFY(x) MUNIT_STRINGIFY(x)
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) ||  \
+  defined(__IBMCPP__)
+#  define MUNIT_THREAD_LOCAL __thread
+#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201102L)) ||          \
+  defined(_Thread_local)
+#  define MUNIT_THREAD_LOCAL _Thread_local
+#elif defined(_WIN32)
+#  define MUNIT_THREAD_LOCAL __declspec(thread)
+#endif
+
+/* MSVC 12.0 will emit a warning at /W4 for code like 'do { ... }
+ * while (0)', or 'do { ... } while (1)'.  I'm pretty sure nobody
+ * at Microsoft compiles with /W4. */
+#if defined(_MSC_VER) && (_MSC_VER <= 1800)
+#  pragma warning(disable : 4127)
+#endif
+
+#if defined(_WIN32) || defined(__EMSCRIPTEN__)
+#  define MUNIT_NO_FORK
+#endif
+
+#if defined(__EMSCRIPTEN__)
+#  define MUNIT_NO_BUFFER
+#endif
+
+/*** Logging ***/
+
+static MunitLogLevel munit_log_level_visible = MUNIT_LOG_INFO;
+static MunitLogLevel munit_log_level_fatal = MUNIT_LOG_ERROR;
+
+#if defined(MUNIT_THREAD_LOCAL)
+static MUNIT_THREAD_LOCAL munit_bool munit_error_jmp_buf_valid = 0;
+static MUNIT_THREAD_LOCAL jmp_buf munit_error_jmp_buf;
+#endif
+
+/* At certain warning levels, mingw will trigger warnings about
+ * suggesting the format attribute, which we've explicitly *not* set
+ * because it will then choke on our attempts to use the MS-specific
+ * I64 modifier for size_t (which we have to use since MSVC doesn't
+ * support the C99 z modifier). */
+
+#if defined(__MINGW32__) || defined(__MINGW64__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
+#endif
+
+MUNIT_PRINTF(5, 0)
+static void munit_logf_exv(MunitLogLevel level, FILE *fp, const char *filename,
+                           int line, const char *format, va_list ap) {
+  if (level < munit_log_level_visible)
+    return;
+
+  switch (level) {
+  case MUNIT_LOG_DEBUG:
+    fputs("Debug", fp);
+    break;
+  case MUNIT_LOG_INFO:
+    fputs("Info", fp);
+    break;
+  case MUNIT_LOG_WARNING:
+    fputs("Warning", fp);
+    break;
+  case MUNIT_LOG_ERROR:
+    fputs("Error", fp);
+    break;
+  default:
+    munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Invalid log level (%d)",
+                  level);
+    return;
+  }
+
+  fputs(": ", fp);
+  if (filename != NULL)
+    fprintf(fp, "%s:%d: ", filename, line);
+  vfprintf(fp, format, ap);
+  fputc('\n', fp);
+}
+
+MUNIT_PRINTF(3, 4)
+static void munit_logf_internal(MunitLogLevel level, FILE *fp,
+                                const char *format, ...) {
+  va_list ap;
+
+  va_start(ap, format);
+  munit_logf_exv(level, fp, NULL, 0, format, ap);
+  va_end(ap);
+}
+
+static void munit_log_internal(MunitLogLevel level, FILE *fp,
+                               const char *message) {
+  munit_logf_internal(level, fp, "%s", message);
+}
+
+void munit_logf_ex(MunitLogLevel level, const char *filename, int line,
+                   const char *format, ...) {
+  va_list ap;
+
+  va_start(ap, format);
+  munit_logf_exv(level, stderr, filename, line, format, ap);
+  va_end(ap);
+
+  if (level >= munit_log_level_fatal) {
+#if defined(MUNIT_THREAD_LOCAL)
+    if (munit_error_jmp_buf_valid)
+      longjmp(munit_error_jmp_buf, 1);
+#endif
+    abort();
+  }
+}
+
+void munit_errorf_ex(const char *filename, int line, const char *format, ...) {
+  va_list ap;
+
+  va_start(ap, format);
+  munit_logf_exv(MUNIT_LOG_ERROR, stderr, filename, line, format, ap);
+  va_end(ap);
+
+#if defined(MUNIT_THREAD_LOCAL)
+  if (munit_error_jmp_buf_valid)
+    longjmp(munit_error_jmp_buf, 1);
+#endif
+  abort();
+}
+
+#if defined(__MINGW32__) || defined(__MINGW64__)
+#  pragma GCC diagnostic pop
+#endif
+
+#if !defined(MUNIT_STRERROR_LEN)
+#  define MUNIT_STRERROR_LEN 80
+#endif
+
+static void munit_log_errno(MunitLogLevel level, FILE *fp, const char *msg) {
+#if defined(MUNIT_NO_STRERROR_R) ||                                            \
+  (defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API))
+  munit_logf_internal(level, fp, "%s: %s (%d)", msg, strerror(errno), errno);
+#else
+  char munit_error_str[MUNIT_STRERROR_LEN];
+  munit_error_str[0] = '\0';
+
+#  if !defined(_WIN32)
+  strerror_r(errno, munit_error_str, MUNIT_STRERROR_LEN);
+#  else
+  strerror_s(munit_error_str, MUNIT_STRERROR_LEN, errno);
+#  endif
+
+  munit_logf_internal(level, fp, "%s: %s (%d)", msg, munit_error_str, errno);
+#endif
+}
+
+/*** Memory allocation ***/
+
+void *munit_malloc_ex(const char *filename, int line, size_t size) {
+  void *ptr;
+
+  if (size == 0)
+    return NULL;
+
+  ptr = calloc(1, size);
+  if (MUNIT_UNLIKELY(ptr == NULL)) {
+    munit_logf_ex(MUNIT_LOG_ERROR, filename, line,
+                  "Failed to allocate %" MUNIT_SIZE_MODIFIER "u bytes.", size);
+  }
+
+  return ptr;
+}
+
+/*** Timer code ***/
+
+#if defined(MUNIT_ENABLE_TIMING)
+
+#  define psnip_uint64_t munit_uint64_t
+#  define psnip_uint32_t munit_uint32_t
+
+/* Code copied from portable-snippets
+ * <https://github.com/nemequ/portable-snippets/>.  If you need to
+ * change something, please do it there so we can keep the code in
+ * sync. */
+
+/* Clocks (v1)
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson <evan@nemerson.com>
+ *
+ *   To the extent possible under law, the authors have waived all
+ *   copyright and related or neighboring rights to this code.  For
+ *   details, see the Creative Commons Zero 1.0 Universal license at
+ *   https://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+#  if !defined(PSNIP_CLOCK_H)
+#    define PSNIP_CLOCK_H
+
+#    if !defined(psnip_uint64_t)
+#      include "../exact-int/exact-int.h"
+#    endif
+
+#    if !defined(PSNIP_CLOCK_STATIC_INLINE)
+#      if defined(__GNUC__)
+#        define PSNIP_CLOCK__COMPILER_ATTRIBUTES __attribute__((__unused__))
+#      else
+#        define PSNIP_CLOCK__COMPILER_ATTRIBUTES
+#      endif
+
+#      define PSNIP_CLOCK__FUNCTION PSNIP_CLOCK__COMPILER_ATTRIBUTES static
+#    endif
+
+enum PsnipClockType {
+  /* This clock provides the current time, in units since 1970-01-01
+   * 00:00:00 UTC not including leap seconds.  In other words, UNIX
+   * time.  Keep in mind that this clock doesn't account for leap
+   * seconds, and can go backwards (think NTP adjustments). */
+  PSNIP_CLOCK_TYPE_WALL = 1,
+  /* The CPU time is a clock which increases only when the current
+   * process is active (i.e., it doesn't increment while blocking on
+   * I/O). */
+  PSNIP_CLOCK_TYPE_CPU = 2,
+  /* Monotonic time is always running (unlike CPU time), but it only
+     ever moves forward unless you reboot the system.  Things like NTP
+     adjustments have no effect on this clock. */
+  PSNIP_CLOCK_TYPE_MONOTONIC = 3
+};
+
+struct PsnipClockTimespec {
+  psnip_uint64_t seconds;
+  psnip_uint64_t nanoseconds;
+};
+
+/* Methods we support: */
+
+#    define PSNIP_CLOCK_METHOD_CLOCK_GETTIME 1
+#    define PSNIP_CLOCK_METHOD_TIME 2
+#    define PSNIP_CLOCK_METHOD_GETTIMEOFDAY 3
+#    define PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER 4
+#    define PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME 5
+#    define PSNIP_CLOCK_METHOD_CLOCK 6
+#    define PSNIP_CLOCK_METHOD_GETPROCESSTIMES 7
+#    define PSNIP_CLOCK_METHOD_GETRUSAGE 8
+#    define PSNIP_CLOCK_METHOD_GETSYSTEMTIMEPRECISEASFILETIME 9
+#    define PSNIP_CLOCK_METHOD_GETTICKCOUNT64 10
+
+#    include <assert.h>
+
+#    if defined(HEDLEY_UNREACHABLE)
+#      define PSNIP_CLOCK_UNREACHABLE() HEDLEY_UNREACHABLE()
+#    else
+#      define PSNIP_CLOCK_UNREACHABLE() assert(0)
+#    endif
+
+/* Choose an implementation */
+
+/* #undef PSNIP_CLOCK_WALL_METHOD */
+/* #undef PSNIP_CLOCK_CPU_METHOD */
+/* #undef PSNIP_CLOCK_MONOTONIC_METHOD */
+
+/* We want to be able to detect the libc implementation, so we include
+   <limits.h> (<features.h> isn't available everywhere). */
+
+#    if defined(__unix__) || defined(__unix) || defined(__linux__)
+#      include <limits.h>
+#      include <unistd.h>
+#    endif
+
+#    if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
+/* These are known to work without librt.  If you know of others
+ * please let us know so we can add them. */
+#      if (defined(__GLIBC__) &&                                               \
+           (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17))) ||    \
+        (defined(__FreeBSD__))
+#        define PSNIP_CLOCK_HAVE_CLOCK_GETTIME
+#      elif !defined(PSNIP_CLOCK_NO_LIBRT)
+#        define PSNIP_CLOCK_HAVE_CLOCK_GETTIME
+#      endif
+#    endif
+
+#    if defined(_WIN32)
+#      if !defined(PSNIP_CLOCK_CPU_METHOD)
+#        define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_GETPROCESSTIMES
+#      endif
+#      if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+#        define PSNIP_CLOCK_MONOTONIC_METHOD                                   \
+          PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
+#      endif
+#    endif
+
+#    if defined(__MACH__) && !defined(__gnu_hurd__)
+#      if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+#        define PSNIP_CLOCK_MONOTONIC_METHOD                                   \
+          PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
+#      endif
+#    endif
+
+#    if defined(PSNIP_CLOCK_HAVE_CLOCK_GETTIME)
+#      include <time.h>
+#      if !defined(PSNIP_CLOCK_WALL_METHOD)
+#        if defined(CLOCK_REALTIME_PRECISE)
+#          define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+#          define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME_PRECISE
+#        elif !defined(__sun)
+#          define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+#          define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME
+#        endif
+#      endif
+#      if !defined(PSNIP_CLOCK_CPU_METHOD)
+#        if defined(_POSIX_CPUTIME) || defined(CLOCK_PROCESS_CPUTIME_ID)
+#          define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+#          define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_PROCESS_CPUTIME_ID
+#        elif defined(CLOCK_VIRTUAL)
+#          define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+#          define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_VIRTUAL
+#        endif
+#      endif
+#      if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+#        if defined(CLOCK_MONOTONIC_RAW)
+#          define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+#          define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC
+#        elif defined(CLOCK_MONOTONIC_PRECISE)
+#          define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+#          define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC_PRECISE
+#        elif defined(_POSIX_MONOTONIC_CLOCK) || defined(CLOCK_MONOTONIC)
+#          define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+#          define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC
+#        endif
+#      endif
+#    endif
+
+#    if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L)
+#      if !defined(PSNIP_CLOCK_WALL_METHOD)
+#        define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_GETTIMEOFDAY
+#      endif
+#    endif
+
+#    if !defined(PSNIP_CLOCK_WALL_METHOD)
+#      define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_TIME
+#    endif
+
+#    if !defined(PSNIP_CLOCK_CPU_METHOD)
+#      define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK
+#    endif
+
+/* Primarily here for testing. */
+#    if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                              \
+      defined(PSNIP_CLOCK_REQUIRE_MONOTONIC)
+#      error No monotonic clock found.
+#    endif
+
+/* Implementations */
+
+#    if (defined(PSNIP_CLOCK_CPU_METHOD) &&                                    \
+         (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) ||      \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) ||       \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) ||  \
+      (defined(PSNIP_CLOCK_CPU_METHOD) &&                                      \
+       (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) ||                \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) ||               \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) ||          \
+      (defined(PSNIP_CLOCK_CPU_METHOD) &&                                      \
+       (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_TIME)) ||                 \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME)) ||                \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_TIME))
+#      include <time.h>
+#    endif
+
+#    if (defined(PSNIP_CLOCK_CPU_METHOD) &&                                    \
+         (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) ||       \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) ||        \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY))
+#      include <sys/time.h>
+#    endif
+
+#    if (defined(PSNIP_CLOCK_CPU_METHOD) &&                                    \
+         (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) ||    \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) ||     \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD ==                                        \
+        PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) ||                                \
+      (defined(PSNIP_CLOCK_CPU_METHOD) &&                                      \
+       (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) ||       \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) ||      \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64))
+#      include <windows.h>
+#    endif
+
+#    if (defined(PSNIP_CLOCK_CPU_METHOD) &&                                    \
+         (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) ||          \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) ||           \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE))
+#      include <sys/time.h>
+#      include <sys/resource.h>
+#    endif
+
+#    if (defined(PSNIP_CLOCK_CPU_METHOD) &&                                    \
+         (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) ||  \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD ==                                        \
+        PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME))
+#      include <CoreServices/CoreServices.h>
+#      include <mach/mach.h>
+#      include <mach/mach_time.h>
+#    endif
+
+/*** Implementations ***/
+
+#    define PSNIP_CLOCK_NSEC_PER_SEC ((psnip_uint32_t)(1000000000ULL))
+
+#    if (defined(PSNIP_CLOCK_CPU_METHOD) &&                                    \
+         (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) ||      \
+      (defined(PSNIP_CLOCK_WALL_METHOD) &&                                     \
+       (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) ||       \
+      (defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                                \
+       (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME))
+/* Ticks per second of clk_id, or 0 if the resolution is unavailable. */
+PSNIP_CLOCK__FUNCTION psnip_uint32_t
+psnip_clock__clock_getres(clockid_t clk_id) {
+  struct timespec res;
+  /* A failed query, or a resolution with no sub-second component,
+   * must not reach the division below (division by zero is UB). */
+  if (clock_getres(clk_id, &res) != 0 || res.tv_nsec <= 0)
+    return 0;
+
+  return (psnip_uint32_t)(PSNIP_CLOCK_NSEC_PER_SEC /
+                          (psnip_uint64_t)res.tv_nsec);
+}
+
+PSNIP_CLOCK__FUNCTION int
+psnip_clock__clock_gettime(clockid_t clk_id, struct PsnipClockTimespec *res) {
+  struct timespec ts;
+
+  if (clock_gettime(clk_id, &ts) != 0)
+    return -10;
+
+  res->seconds = (psnip_uint64_t)(ts.tv_sec);
+  res->nanoseconds = (psnip_uint64_t)(ts.tv_nsec);
+
+  return 0;
+}
+#    endif
+
+PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_wall_get_precision(void) {
+#    if !defined(PSNIP_CLOCK_WALL_METHOD)
+  return 0;
+#    elif defined(PSNIP_CLOCK_WALL_METHOD) &&                                  \
+      PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_WALL);
+#    elif defined(PSNIP_CLOCK_WALL_METHOD) &&                                  \
+      PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY
+  return 1000000;
+#    elif defined(PSNIP_CLOCK_WALL_METHOD) &&                                  \
+      PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME
+  return 1;
+#    else
+  return 0;
+#    endif
+}
+
+PSNIP_CLOCK__FUNCTION int
+psnip_clock_wall_get_time(struct PsnipClockTimespec *res) {
+#    if !defined(PSNIP_CLOCK_WALL_METHOD)
+  (void)res;
+
+  return -2;
+#    elif defined(PSNIP_CLOCK_WALL_METHOD) &&                                  \
+      PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_WALL, res);
+#    elif defined(PSNIP_CLOCK_WALL_METHOD) &&                                  \
+      PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME
+  res->seconds = time(NULL);
+  res->nanoseconds = 0;
+#    elif defined(PSNIP_CLOCK_WALL_METHOD) &&                                  \
+      PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY
+  struct timeval tv;
+
+  if (gettimeofday(&tv, NULL) != 0)
+    return -6;
+
+  res->seconds = (psnip_uint64_t)tv.tv_sec;
+  res->nanoseconds = (psnip_uint64_t)tv.tv_usec * 1000;
+#    else
+  (void)res;
+
+  return -2;
+#    endif
+
+  return 0;
+}
+
+PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_cpu_get_precision(void) {
+#    if !defined(PSNIP_CLOCK_CPU_METHOD)
+  return 0;
+#    elif defined(PSNIP_CLOCK_CPU_METHOD) &&                                   \
+      PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_CPU);
+#    elif defined(PSNIP_CLOCK_CPU_METHOD) &&                                   \
+      PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK
+  return CLOCKS_PER_SEC;
+#    elif defined(PSNIP_CLOCK_CPU_METHOD) &&                                   \
+      PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES
+  return PSNIP_CLOCK_NSEC_PER_SEC / 100;
+#    else
+  return 0;
+#    endif
+}
+
+/* Store the current CPU-clock reading in *res.  Returns 0 on success,
+ * or a negative value if no method is configured or the call fails. */
+PSNIP_CLOCK__FUNCTION int
+psnip_clock_cpu_get_time(struct PsnipClockTimespec *res) {
+#    if !defined(PSNIP_CLOCK_CPU_METHOD)
+  (void)res;
+  return -2;
+#    elif defined(PSNIP_CLOCK_CPU_METHOD) &&                                   \
+      PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_CPU, res);
+#    elif defined(PSNIP_CLOCK_CPU_METHOD) &&                                   \
+      PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK
+  clock_t t = clock();
+  if (t == ((clock_t)-1))
+    return -5;
+  res->seconds = t / CLOCKS_PER_SEC;
+  res->nanoseconds =
+    (t % CLOCKS_PER_SEC) * (PSNIP_CLOCK_NSEC_PER_SEC / CLOCKS_PER_SEC);
+#    elif defined(PSNIP_CLOCK_CPU_METHOD) &&                                   \
+      PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES
+  FILETIME CreationTime, ExitTime, KernelTime, UserTime;
+  LARGE_INTEGER date;
+
+  if (!GetProcessTimes(GetCurrentProcess(), &CreationTime, &ExitTime,
+                       &KernelTime, &UserTime))
+    return -7;
+
+  /* UserTime is an elapsed duration in 100 ns units, not a calendar
+   * timestamp, so no FILETIME-to-UNIX epoch adjustment applies. */
+  date.HighPart = (LONG)UserTime.dwHighDateTime;
+  date.LowPart = UserTime.dwLowDateTime;
+
+  res->seconds = (psnip_uint64_t)(date.QuadPart / 10000000);
+  res->nanoseconds = (psnip_uint64_t)(date.QuadPart % 10000000) * 100;
+#    elif PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE
+  struct rusage usage;
+  if (getrusage(RUSAGE_SELF, &usage) != 0)
+    return -8;
+
+  res->seconds = usage.ru_utime.tv_sec;
+  res->nanoseconds = usage.ru_utime.tv_usec * 1000;
+#    else
+  (void)res;
+  return -2;
+#    endif
+
+  return 0;
+}
+
+PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_monotonic_get_precision(void) {
+#    if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+  return 0;
+#    elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                             \
+      PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC);
+#    elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                             \
+      PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
+  static mach_timebase_info_data_t tbi = {
+    0,
+  };
+  if (tbi.denom == 0)
+    mach_timebase_info(&tbi);
+  return (psnip_uint32_t)(tbi.numer / tbi.denom);
+#    elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                             \
+      PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
+  return 1000;
+#    elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                             \
+      PSNIP_CLOCK_MONOTONIC_METHOD ==                                          \
+        PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
+  LARGE_INTEGER Frequency;
+  QueryPerformanceFrequency(&Frequency);
+  return (psnip_uint32_t)((Frequency.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC)
+                            ? PSNIP_CLOCK_NSEC_PER_SEC
+                            : Frequency.QuadPart);
+#    else
+  return 0;
+#    endif
+}
+
+/* Read the monotonic clock into *res; 0 on success, negative on failure. */
+PSNIP_CLOCK__FUNCTION int
+psnip_clock_monotonic_get_time(struct PsnipClockTimespec *res) {
+#    if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+  (void)res;
+  return -2;
+#    elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                             \
+      PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC, res);
+#    elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                             \
+      PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
+  psnip_uint64_t nsec = mach_absolute_time();
+  static mach_timebase_info_data_t tbi = {0};
+  if (tbi.denom == 0)
+    mach_timebase_info(&tbi);
+  /* Multiply before dividing: numer/denom need not be a whole number. */
+  nsec = nsec * (psnip_uint64_t)tbi.numer / (psnip_uint64_t)tbi.denom;
+  res->seconds = nsec / PSNIP_CLOCK_NSEC_PER_SEC;
+  res->nanoseconds = nsec % PSNIP_CLOCK_NSEC_PER_SEC;
+#    elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                             \
+      PSNIP_CLOCK_MONOTONIC_METHOD ==                                          \
+        PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
+  LARGE_INTEGER t, f;
+  if (QueryPerformanceCounter(&t) == 0)
+    return -12;
+
+  QueryPerformanceFrequency(&f);
+  res->seconds = (psnip_uint64_t)(t.QuadPart / f.QuadPart);
+  res->nanoseconds = (psnip_uint64_t)(t.QuadPart % f.QuadPart);
+  if (f.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC)
+    res->nanoseconds /= (psnip_uint64_t)f.QuadPart / PSNIP_CLOCK_NSEC_PER_SEC;
+  else
+    res->nanoseconds *= PSNIP_CLOCK_NSEC_PER_SEC / (psnip_uint64_t)f.QuadPart;
+#    elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) &&                             \
+      PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
+  const ULONGLONG msec = GetTickCount64();
+  res->seconds = msec / 1000;
+  res->nanoseconds = (msec % 1000) * 1000000; /* leftover ms -> ns */
+#    else
+  return -2;
+#    endif
+
+  return 0;
+}
+
+/* Returns the number of ticks per second for the specified clock.
+ * For example, a clock with millisecond precision would return 1000,
+ * and a clock with 1 second (such as the time() function) would
+ * return 1.
+ *
+ * If the requested clock isn't available, it will return 0.
+ * Hopefully this will be rare, but if it happens to you please let us
+ * know so we can work on finding a way to support your system.
+ *
+ * Note that different clocks on the same system often have
+ * different precisions.
+ */
+PSNIP_CLOCK__FUNCTION psnip_uint32_t
+psnip_clock_get_precision(enum PsnipClockType clock_type) {
+  switch (clock_type) {
+  case PSNIP_CLOCK_TYPE_MONOTONIC:
+    return psnip_clock_monotonic_get_precision();
+  case PSNIP_CLOCK_TYPE_CPU:
+    return psnip_clock_cpu_get_precision();
+  case PSNIP_CLOCK_TYPE_WALL:
+    return psnip_clock_wall_get_precision();
+  }
+
+  PSNIP_CLOCK_UNREACHABLE();
+  return 0;
+}
+
+/* Set the provided timespec to the requested time.  Returns 0 on
+ * success, or a negative value on failure. */
+PSNIP_CLOCK__FUNCTION int psnip_clock_get_time(enum PsnipClockType clock_type,
+                                               struct PsnipClockTimespec *res) {
+  assert(res != NULL);
+
+  switch (clock_type) {
+  case PSNIP_CLOCK_TYPE_MONOTONIC:
+    return psnip_clock_monotonic_get_time(res);
+  case PSNIP_CLOCK_TYPE_CPU:
+    return psnip_clock_cpu_get_time(res);
+  case PSNIP_CLOCK_TYPE_WALL:
+    return psnip_clock_wall_get_time(res);
+  }
+
+  return -1;
+}
+
+#  endif /* !defined(PSNIP_CLOCK_H) */
+
+static psnip_uint64_t munit_clock_get_elapsed(struct PsnipClockTimespec *start,
+                                              struct PsnipClockTimespec *end) {
+  psnip_uint64_t r = (end->seconds - start->seconds) * PSNIP_CLOCK_NSEC_PER_SEC;
+  if (end->nanoseconds < start->nanoseconds) {
+    return r - (start->nanoseconds - end->nanoseconds);
+  }
+
+  return r + (end->nanoseconds - start->nanoseconds);
+}
+
+#else
+#  include <time.h>
+#endif /* defined(MUNIT_ENABLE_TIMING) */
+
+/*** PRNG stuff ***/
+
+/* This is (unless I screwed up, which is entirely possible) the
+ * version of PCG with 32-bit state.  It was chosen because it has a
+ * small enough state that we should reliably be able to use CAS
+ * instead of requiring a lock for thread-safety.
+ *
+ * If I did screw up, I probably will not bother changing it unless
+ * there is a significant bias.  It's really not important this be
+ * particularly strong, as long as it is fairly random it's much more
+ * important that it be reproducible, so bug reports have a better
+ * chance of being reproducible. */
+
+/* Choose an atomics backend for the PRNG state.
+ *
+ * HAVE_STDATOMIC is defined for C11 (or later) compilers that do not
+ * define __STDC_NO_ATOMICS__, excluding Emscripten and compilers that
+ * report a GNU C version of 4.8 or older.  Failing that,
+ * HAVE_CLANG_ATOMICS is defined when clang reports the c_atomic
+ * extension. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) &&              \
+  !defined(__STDC_NO_ATOMICS__) && !defined(__EMSCRIPTEN__) &&                 \
+  (!defined(__GNUC_MINOR__) || (__GNUC__ > 4) ||                               \
+   (__GNUC__ == 4 && __GNUC_MINOR__ > 8))
+#  define HAVE_STDATOMIC
+#elif defined(__clang__)
+#  if __has_extension(c_atomic)
+#    define HAVE_CLANG_ATOMICS
+#  endif
+#endif
+
+/* Workaround for http://llvm.org/bugs/show_bug.cgi?id=26911 */
+/* clang on Windows never uses <stdatomic.h>; the c2 variant additionally
+ * gives up the clang atomics extension. */
+#if defined(__clang__) && defined(_WIN32)
+#  undef HAVE_STDATOMIC
+#  if defined(__c2__)
+#    undef HAVE_CLANG_ATOMICS
+#  endif
+#endif
+
+/* ATOMIC_UINT32_T is the storage type of the shared PRNG state.  The
+ * first matching case wins: plain uint32_t under OpenMP, _Atomic
+ * uint32_t with C11 or clang atomics, volatile LONG on Windows, and
+ * volatile uint32_t everywhere else. */
+#if defined(_OPENMP)
+#  define ATOMIC_UINT32_T uint32_t
+#elif defined(HAVE_STDATOMIC)
+#  include <stdatomic.h>
+#  define ATOMIC_UINT32_T _Atomic uint32_t
+#elif defined(HAVE_CLANG_ATOMICS)
+#  define ATOMIC_UINT32_T _Atomic uint32_t
+#elif defined(_WIN32)
+#  define ATOMIC_UINT32_T volatile LONG
+#else
+#  define ATOMIC_UINT32_T volatile uint32_t
+#endif
+
+/* Global PRNG state.  42 is only the initial value; munit_rand_seed()
+ * replaces it. */
+static ATOMIC_UINT32_T munit_rand_state = 42;
+
+/* Atomic accessors for the PRNG state.  Every backend provides the same
+ * three operations:
+ *
+ *   munit_atomic_store(dest, value)         store value into *dest
+ *   munit_atomic_load(src)                  read *src
+ *   munit_atomic_cas(dest, expected, value) if *dest == *expected, store
+ *                                           value and yield non-zero;
+ *                                           otherwise yield zero
+ *
+ * Callers retry in a `while (!munit_atomic_cas(...))` loop and reload the
+ * expected value themselves, so a backend may (but need not) update
+ * *expected on failure, and spurious failures are tolerated. */
+#if defined(_OPENMP)
+/* OpenMP: serialize every access through one named critical section. */
+static inline void munit_atomic_store(ATOMIC_UINT32_T *dest,
+                                      ATOMIC_UINT32_T value) {
+#  pragma omp critical(munit_atomics)
+  *dest = value;
+}
+
+static inline uint32_t munit_atomic_load(ATOMIC_UINT32_T *src) {
+  /* Same type as the state so values above INT_MAX survive unchanged. */
+  uint32_t ret;
+#  pragma omp critical(munit_atomics)
+  ret = *src;
+  return ret;
+}
+
+static inline uint32_t munit_atomic_cas(ATOMIC_UINT32_T *dest,
+                                        ATOMIC_UINT32_T *expected,
+                                        ATOMIC_UINT32_T desired) {
+  munit_bool ret;
+
+#  pragma omp critical(munit_atomics)
+  {
+    if (*dest == *expected) {
+      *dest = desired;
+      ret = 1;
+    } else {
+      ret = 0;
+    }
+  }
+
+  return ret;
+}
+#elif defined(HAVE_STDATOMIC)
+#  define munit_atomic_store(dest, value) atomic_store(dest, value)
+#  define munit_atomic_load(src) atomic_load(src)
+#  define munit_atomic_cas(dest, expected, value)                              \
+    atomic_compare_exchange_weak(dest, expected, value)
+#elif defined(HAVE_CLANG_ATOMICS)
+#  define munit_atomic_store(dest, value)                                      \
+    __c11_atomic_store(dest, value, __ATOMIC_SEQ_CST)
+#  define munit_atomic_load(src) __c11_atomic_load(src, __ATOMIC_SEQ_CST)
+#  define munit_atomic_cas(dest, expected, value)                              \
+    __c11_atomic_compare_exchange_weak(dest, expected, value,                  \
+                                       __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+/* GCC 4.7 or newer: __atomic builtins.  The version test is fully
+ * parenthesized so it does not depend on && / || precedence. */
+#elif defined(__GNUC__) &&                                                     \
+  ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+#  define munit_atomic_store(dest, value)                                      \
+    __atomic_store_n(dest, value, __ATOMIC_SEQ_CST)
+#  define munit_atomic_load(src) __atomic_load_n(src, __ATOMIC_SEQ_CST)
+#  define munit_atomic_cas(dest, expected, value)                              \
+    __atomic_compare_exchange_n(dest, expected, value, 1, __ATOMIC_SEQ_CST,    \
+                                __ATOMIC_SEQ_CST)
+/* Older GCC 4.x: only the CAS uses a __sync builtin; loads and stores
+ * are plain accesses. */
+#elif defined(__GNUC__) && (__GNUC__ >= 4)
+#  define munit_atomic_store(dest, value)                                      \
+    do {                                                                       \
+      *(dest) = (value);                                                       \
+    } while (0)
+#  define munit_atomic_load(src) (*(src))
+#  define munit_atomic_cas(dest, expected, value)                              \
+    __sync_bool_compare_and_swap(dest, *expected, value)
+#elif defined(_WIN32) /* Untested */
+#  define munit_atomic_store(dest, value)                                      \
+    do {                                                                       \
+      *(dest) = (value);                                                       \
+    } while (0)
+#  define munit_atomic_load(src) (*(src))
+/* InterlockedCompareExchange returns the value *dest held before the
+ * call, not a success flag; the exchange happened exactly when that
+ * value equals the comparand, so compare to get the boolean callers
+ * expect. */
+#  define munit_atomic_cas(dest, expected, value)                              \
+    (InterlockedCompareExchange((dest), (LONG)(value),                         \
+                                (LONG)(*(expected))) == (LONG)(*(expected)))
+#else
+#  warning No atomic implementation, PRNG will not be thread-safe
+#  define munit_atomic_store(dest, value)                                      \
+    do {                                                                       \
+      *(dest) = (value);                                                       \
+    } while (0)
+#  define munit_atomic_load(src) (*(src))
+/* Non-atomic fallback with the same contract as the real CAS. */
+static inline munit_bool munit_atomic_cas(ATOMIC_UINT32_T *dest,
+                                          ATOMIC_UINT32_T *expected,
+                                          ATOMIC_UINT32_T desired) {
+  if (*dest == *expected) {
+    *dest = desired;
+    return 1;
+  } else {
+    return 0;
+  }
+}
+#endif
+
+/* Multiplier and increment of the state-advance step below. */
+#define MUNIT_PRNG_MULTIPLIER (747796405U)
+#define MUNIT_PRNG_INCREMENT (1729U)
+
+/* Advance the generator by one step: state * multiplier + increment,
+ * wrapping in 32 bits. */
+static munit_uint32_t munit_rand_next_state(munit_uint32_t state) {
+  munit_uint32_t advanced = state;
+
+  advanced *= MUNIT_PRNG_MULTIPLIER;
+  advanced += MUNIT_PRNG_INCREMENT;
+  return advanced;
+}
+
+/* Output permutation: derive the value handed to callers from a raw
+ * state word.  The top four state bits select the first shift amount
+ * (4..19). */
+static munit_uint32_t munit_rand_from_state(munit_uint32_t state) {
+  const munit_uint32_t shift = (state >> 28) + 4;
+  munit_uint32_t mixed = (state >> shift) ^ state;
+
+  mixed *= (277803737U);
+  return mixed ^ (mixed >> 22);
+}
+
+/* Reset the global PRNG so that the sequence that follows depends only
+ * on `seed`. */
+void munit_rand_seed(munit_uint32_t seed) {
+  const munit_uint32_t initial =
+    munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT);
+
+  munit_atomic_store(&munit_rand_state, initial);
+}
+
+/* Produce a seed from the clock: the nanosecond field of the wall clock
+ * when timing support is compiled in, time(NULL) otherwise.  The raw
+ * reading is pushed through one PRNG step and the output permutation. */
+static munit_uint32_t munit_rand_generate_seed(void) {
+  munit_uint32_t entropy;
+#if defined(MUNIT_ENABLE_TIMING)
+  struct PsnipClockTimespec now = {
+    0,
+  };
+
+  psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &now);
+  entropy = (munit_uint32_t)now.nanoseconds;
+#else
+  entropy = (munit_uint32_t)time(NULL);
+#endif
+
+  return munit_rand_from_state(
+    munit_rand_next_state(entropy + MUNIT_PRNG_INCREMENT));
+}
+
+/* Draw one 32-bit value from a caller-owned state word and advance that
+ * state in place.  The value is derived from the state as it was before
+ * the advance. */
+static munit_uint32_t munit_rand_state_uint32(munit_uint32_t *state) {
+  const munit_uint32_t current = *state;
+  const munit_uint32_t drawn = munit_rand_from_state(current);
+
+  *state = munit_rand_next_state(current);
+  return drawn;
+}
+
+/* Draw one 32-bit value from the global PRNG.  The state is advanced
+ * with a compare-and-swap retry loop, so a concurrent update simply
+ * causes another attempt. */
+munit_uint32_t munit_rand_uint32(void) {
+  munit_uint32_t old, state;
+
+  for (;;) {
+    old = munit_atomic_load(&munit_rand_state);
+    state = munit_rand_next_state(old);
+    if (munit_atomic_cas(&munit_rand_state, &old, state))
+      break;
+  }
+
+  return munit_rand_from_state(old);
+}
+
+/* Fill `size` bytes at `data` with output drawn from a caller-owned
+ * state.  Whole 32-bit words are copied first; a trailing partial word
+ * consumes one more draw and copies only the bytes that are needed. */
+static void munit_rand_state_memory(munit_uint32_t *state, size_t size,
+                                    munit_uint8_t *data) {
+  const size_t full_words = size / sizeof(munit_uint32_t);
+  const size_t tail_bytes = size % sizeof(munit_uint32_t);
+  munit_uint8_t *out = data;
+  munit_uint32_t word;
+  size_t i;
+
+  for (i = 0; i < full_words; i++) {
+    word = munit_rand_state_uint32(state);
+    memcpy(out, &word, sizeof(munit_uint32_t));
+    out += sizeof(munit_uint32_t);
+  }
+
+  if (tail_bytes > 0) {
+    word = munit_rand_state_uint32(state);
+    memcpy(out, &word, tail_bytes);
+  }
+}
+
+/* Fill `size` bytes at `data` from the global PRNG.  The buffer is
+ * generated from a private copy of the state; if another thread moved
+ * the global state in the meantime the buffer is regenerated. */
+void munit_rand_memory(size_t size, munit_uint8_t *data) {
+  munit_uint32_t old, state;
+
+  for (;;) {
+    old = munit_atomic_load(&munit_rand_state);
+    state = old;
+    munit_rand_state_memory(&state, size, data);
+    if (munit_atomic_cas(&munit_rand_state, &old, state))
+      break;
+  }
+}
+
+/* Draw a value in the closed range [0, max] from a caller-owned state.
+ *
+ * state: PRNG state, advanced once per draw.
+ * salt:  XORed into every raw draw so different callers diverge.
+ * max:   largest value that may be returned; 0 is valid and always
+ *        yields 0.
+ *
+ * Uses rejection sampling so every result is equally likely. */
+static munit_uint32_t munit_rand_state_at_most(munit_uint32_t *state,
+                                               munit_uint32_t salt,
+                                               munit_uint32_t max) {
+  munit_uint32_t bound; /* number of distinct results, max + 1 */
+  munit_uint32_t min;   /* raw draws below this are rejected */
+  munit_uint32_t x;
+
+  /* The full 32-bit range needs no reduction (and max + 1 would wrap). */
+  if (max == (~((munit_uint32_t)0U)))
+    return munit_rand_state_uint32(state) ^ salt;
+
+  bound = max + 1U;
+
+  /* We want (UINT32_MAX + 1) % bound, which in unsigned arithmetic is the
+   * same as (UINT32_MAX + 1 - bound) % bound = -bound % bound. We compute
+   * -bound using not to avoid compiler warnings.  The threshold has to be
+   * derived from the modulus actually applied below (bound, never 0), not
+   * from max: using max divides by zero when max == 0 and rejects the
+   * wrong number of draws otherwise.
+   */
+  min = (munit_uint32_t)(~bound + 1U) % bound;
+
+  do {
+    x = munit_rand_state_uint32(state) ^ salt;
+  } while (x < min);
+
+  return x % bound;
+}
+
+/* Global-PRNG front end for munit_rand_state_at_most(): compute the
+ * result on a private copy of the state and publish the advanced state
+ * with compare-and-swap, retrying if the global state changed. */
+static munit_uint32_t munit_rand_at_most(munit_uint32_t salt,
+                                         munit_uint32_t max) {
+  munit_uint32_t old, state;
+  munit_uint32_t picked;
+
+  for (;;) {
+    old = munit_atomic_load(&munit_rand_state);
+    state = old;
+    picked = munit_rand_state_at_most(&state, salt, max);
+    if (munit_atomic_cas(&munit_rand_state, &old, state))
+      break;
+  }
+
+  return picked;
+}
+
+/* Return a random int in the closed range [min, max].  The bounds may be
+ * given in either order. */
+int munit_rand_int_range(int min, int max) {
+  munit_uint64_t range;
+  munit_uint32_t offset;
+
+  if (min > max)
+    return munit_rand_int_range(max, min);
+
+  /* Width of the range, computed in 64 bits so max - min cannot overflow. */
+  range = (munit_uint64_t)max - (munit_uint64_t)min;
+
+  if (range > (~((munit_uint32_t)0U)))
+    range = (~((munit_uint32_t)0U));
+
+  offset = munit_rand_at_most(0, (munit_uint32_t)range);
+
+  /* Add in unsigned arithmetic: when the range is wider than INT_MAX
+   * (e.g. INT_MIN..INT_MAX) the offset does not fit in an int, and
+   * `min + (int)offset` would be signed overflow.  The true sum always
+   * lies in [min, max], so converting the wrapped unsigned sum back to
+   * int yields it. */
+  return (int)((unsigned int)min + offset);
+}
+
+/* Return a random double in [0, 1): one 32-bit draw divided by 2^32. */
+double munit_rand_double(void) {
+  /* UINT32_MAX + 1, as a double. */
+  const double scale = (~((munit_uint32_t)0U)) + 1.0;
+  munit_uint32_t old, state;
+  double retval = 0.0;
+
+  for (;;) {
+    old = munit_atomic_load(&munit_rand_state);
+    state = old;
+
+    /* See http://mumble.net/~campbell/tmp/random_real.c for how to do
+     * this right.  Patches welcome if you feel that this is too
+     * biased. */
+    retval = munit_rand_state_uint32(&state) / scale;
+
+    if (munit_atomic_cas(&munit_rand_state, &old, state))
+      break;
+  }
+
+  return retval;
+}
+
+/*** Test suite handling ***/
+
+/* Outcome counters.  The same type is used for the per-test tally filled
+ * in by munit_test_runner_exec() and for the totals kept in the runner. */
+typedef struct {
+  unsigned int successful; /* runs that returned MUNIT_OK */
+  unsigned int skipped;    /* runs that returned MUNIT_SKIP */
+  unsigned int failed;     /* runs that returned MUNIT_FAIL */
+  unsigned int errored;    /* runs that returned MUNIT_ERROR, or that could
+                            * not be run or collected */
+#if defined(MUNIT_ENABLE_TIMING)
+  /* Accumulated time, in nanoseconds, of the successful runs. */
+  munit_uint64_t cpu_clock;
+  munit_uint64_t wall_clock;
+#endif
+} MunitReport;
+
+/* State shared by one run of the test runner. */
+typedef struct {
+  const char *prefix; /* NOTE(review): not referenced in this part of the
+                       * file; presumably a name prefix - confirm */
+  const MunitSuite *suite;       /* root suite to execute */
+  const char **tests;            /* NULL, or a NULL-terminated list of test
+                                  * names requested on the command line */
+  munit_uint32_t seed;           /* PRNG seed re-applied before every test */
+  unsigned int iterations;       /* 0 means "use the suite's value" */
+  MunitParameter *parameters;    /* parameters supplied on the command line */
+  munit_bool single_parameter_mode; /* pick one random value per wildcard
+                                     * parameter instead of trying all */
+  void *user_data;               /* handed to setup, or to the test itself
+                                  * when it has no setup */
+  MunitReport report;            /* totals across all tests run so far */
+  munit_bool colorize;           /* emit ANSI colour escapes */
+  munit_bool fork;               /* run each test in a child process */
+  munit_bool show_stderr;        /* replay captured stderr even on success */
+  munit_bool fatal_failures;     /* stop after the first failure or error */
+} MunitTestRunner;
+
+/* Look up `key` in a parameter array terminated by an entry whose name
+ * is NULL.  Returns the value of the first match, or NULL when `params`
+ * is NULL or has no entry with that name. */
+const char *munit_parameters_get(const MunitParameter params[],
+                                 const char *key) {
+  const MunitParameter *cursor = params;
+
+  if (cursor == NULL)
+    return NULL;
+
+  while (cursor->name != NULL) {
+    if (strcmp(cursor->name, key) == 0)
+      return cursor->value;
+    cursor++;
+  }
+
+  return NULL;
+}
+
+#if defined(MUNIT_ENABLE_TIMING)
+/* Print a duration given in nanoseconds to `fp`, as seconds formatted
+ * with MUNIT_TEST_TIME_FORMAT. */
+static void munit_print_time(FILE *fp, munit_uint64_t nanoseconds) {
+  const double seconds =
+    ((double)nanoseconds) / ((double)PSNIP_CLOCK_NSEC_PER_SEC);
+
+  fprintf(fp, "%" MUNIT_TEST_TIME_FORMAT, seconds);
+}
+#endif
+
+/* Add a parameter to an array of parameters.
+ *
+ * params_size: in/out count of real entries (the terminator is not
+ *              counted); incremented on success.
+ * params:      in/out heap array, NULL when empty.  It is grown so that a
+ *              NULL/NULL terminator entry always follows the last real
+ *              entry.
+ * name, value: stored as-is; the strings are not copied.
+ *
+ * Returns MUNIT_OK, or MUNIT_ERROR when the array cannot be grown.  On
+ * failure *params and *params_size are left untouched, so the caller
+ * still owns (and can free) the existing array. */
+static MunitResult munit_parameters_add(size_t *params_size,
+                                        MunitParameter **params, char *name,
+                                        char *value) {
+  /* Grow into a temporary: assigning realloc's result straight to
+   * *params would lose (leak) the old array when realloc fails. */
+  MunitParameter *grown =
+    realloc(*params, sizeof(MunitParameter) * (*params_size + 2));
+  if (grown == NULL)
+    return MUNIT_ERROR;
+  *params = grown;
+
+  (*params)[*params_size].name = name;
+  (*params)[*params_size].value = value;
+  (*params_size)++;
+  (*params)[*params_size].name = NULL;
+  (*params)[*params_size].value = NULL;
+
+  return MUNIT_OK;
+}
+
+/* Concatenate two strings, but just return one of the components
+ * unaltered if the other is NULL or "".
+ *
+ * len:    optional out-parameter receiving the length of the result.
+ * prefix: first component, may be NULL.
+ * suffix: second component, may be NULL.
+ *
+ * Returns NULL (length 0) when both components are empty, and also when
+ * the buffer for a real concatenation cannot be allocated.  A newly
+ * allocated result must be released with munit_maybe_free_concat(). */
+static char *munit_maybe_concat(size_t *len, char *prefix, char *suffix) {
+  char *res;
+  size_t res_l;
+  const size_t prefix_l = prefix != NULL ? strlen(prefix) : 0;
+  const size_t suffix_l = suffix != NULL ? strlen(suffix) : 0;
+  if (prefix_l == 0 && suffix_l == 0) {
+    res = NULL;
+    res_l = 0;
+  } else if (prefix_l == 0) {
+    res = suffix;
+    res_l = suffix_l;
+  } else if (suffix_l == 0) {
+    res = prefix;
+    res_l = prefix_l;
+  } else {
+    res_l = prefix_l + suffix_l;
+    res = malloc(res_l + 1);
+    if (res != NULL) {
+      memcpy(res, prefix, prefix_l);
+      memcpy(res + prefix_l, suffix, suffix_l);
+      res[res_l] = 0;
+    } else {
+      /* Out of memory: report an empty result rather than copying
+       * through a NULL pointer. */
+      res_l = 0;
+    }
+  }
+
+  if (len != NULL)
+    *len = res_l;
+
+  return res;
+}
+
+/* Release a string obtained from munit_maybe_concat().  When the result
+ * is one of the two inputs handed back unchanged there is nothing to
+ * free. */
+static void munit_maybe_free_concat(char *s, const char *prefix,
+                                    const char *suffix) {
+  if (s == prefix || s == suffix)
+    return;
+
+  free(s);
+}
+
+/* Inexpensive string hash (h = h * 33 + c, starting from 5381); its only
+ * job is to salt the PRNG. */
+static munit_uint32_t munit_str_hash(const char *name) {
+  munit_uint32_t h = 5381U;
+  const char *cursor = name;
+
+  while (*cursor != '\0') {
+    h = (munit_uint32_t)(h << 5) + h + (munit_uint32_t)*cursor;
+    cursor++;
+  }
+
+  return h;
+}
+
+/* Copy everything readable from descriptor `from` to descriptor `to`.
+ *
+ * Copying is best effort: it ends silently at end of input, on a read
+ * error, or as soon as a write fails or makes no progress.  Short writes
+ * are completed before the next chunk is read. */
+static void munit_splice(int from, int to) {
+  munit_uint8_t buf[1024];
+#if !defined(_WIN32)
+  ssize_t len;
+  ssize_t bytes_written;
+  ssize_t write_res;
+#else
+  int len;
+  int bytes_written;
+  int write_res;
+#endif
+  for (;;) {
+    len = read(from, buf, sizeof(buf));
+    if (len <= 0)
+      return; /* end of input or read error */
+
+    bytes_written = 0;
+    do {
+      write_res = write(to, buf + bytes_written,
+#if !defined(_WIN32)
+                        (size_t)
+#else
+                        (unsigned int)
+#endif
+                          (len - bytes_written));
+      /* A failed write will not start working for the next chunk, and a
+       * write that returns 0 would never advance bytes_written, so stop
+       * instead of reading on or looping forever. */
+      if (write_res <= 0)
+        return;
+      bytes_written += write_res;
+    } while (bytes_written < len);
+  }
+}
+
+/* Execute one test for the configured number of iterations and tally
+ * the outcome in `report`.  In fork mode this is the part that runs in
+ * the child process.
+ *
+ * Iteration count: 1 when the test sets
+ * MUNIT_TEST_OPTION_SINGLE_ITERATION, else the runner's count, else (if
+ * that is 0) the suite's count.  The test body always runs at least
+ * once.  The first iteration that does not return MUNIT_OK ends the
+ * loop, and timing is only accumulated for successful iterations.
+ *
+ * Returns the result of the last iteration that ran. */
+static MunitResult munit_test_runner_exec(MunitTestRunner *runner,
+                                          const MunitTest *test,
+                                          const MunitParameter params[],
+                                          MunitReport *report) {
+  MunitResult outcome = MUNIT_FAIL;
+  unsigned int rounds;
+  unsigned int completed = 0;
+  void *fixture;
+#if defined(MUNIT_ENABLE_TIMING)
+  struct PsnipClockTimespec wall_start = {0}, wall_stop = {0};
+  struct PsnipClockTimespec cpu_start = {0}, cpu_stop = {0};
+#endif
+
+  if ((test->options & MUNIT_TEST_OPTION_SINGLE_ITERATION) ==
+      MUNIT_TEST_OPTION_SINGLE_ITERATION) {
+    rounds = 1;
+  } else if (runner->iterations == 0) {
+    rounds = runner->suite->iterations;
+  } else {
+    rounds = runner->iterations;
+  }
+
+  munit_rand_seed(runner->seed);
+
+  do {
+    /* Without a setup hook the test receives the runner's user data. */
+    if (test->setup == NULL)
+      fixture = runner->user_data;
+    else
+      fixture = test->setup(params, runner->user_data);
+
+#if defined(MUNIT_ENABLE_TIMING)
+    psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_start);
+    psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_start);
+#endif
+
+    outcome = test->test(params, fixture);
+
+#if defined(MUNIT_ENABLE_TIMING)
+    psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_stop);
+    psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_stop);
+#endif
+
+    if (test->tear_down != NULL)
+      test->tear_down(fixture);
+
+    if (MUNIT_UNLIKELY(outcome != MUNIT_OK)) {
+      /* Record the kind of failure, then stop iterating. */
+      if (outcome == MUNIT_SKIP)
+        report->skipped++;
+      else if (outcome == MUNIT_FAIL)
+        report->failed++;
+      else if (outcome == MUNIT_ERROR)
+        report->errored++;
+      break;
+    }
+
+    report->successful++;
+#if defined(MUNIT_ENABLE_TIMING)
+    report->wall_clock += munit_clock_get_elapsed(&wall_start, &wall_stop);
+    report->cpu_clock += munit_clock_get_elapsed(&cpu_start, &cpu_stop);
+#endif
+  } while (++completed < rounds);
+
+  return outcome;
+}
+
+/* Labels printed between the brackets of a test's result line.
+ * Defining MUNIT_EMOTICON selects the emoticon set; the default is the
+ * fixed-width text set. */
+#if defined(MUNIT_EMOTICON)
+#  define MUNIT_RESULT_STRING_OK ":)"
+#  define MUNIT_RESULT_STRING_SKIP ":|"
+#  define MUNIT_RESULT_STRING_FAIL ":("
+#  define MUNIT_RESULT_STRING_ERROR ":o"
+#  define MUNIT_RESULT_STRING_TODO ":/"
+#else
+#  define MUNIT_RESULT_STRING_OK "OK   "
+#  define MUNIT_RESULT_STRING_SKIP "SKIP "
+#  define MUNIT_RESULT_STRING_FAIL "FAIL "
+#  define MUNIT_RESULT_STRING_ERROR "ERROR"
+#  define MUNIT_RESULT_STRING_TODO "TODO "
+#endif
+
+/* Write `string` to the output file.  When the runner has colour
+ * enabled the text is wrapped in an ANSI foreground escape whose colour
+ * digit is `color` ('1', '2' and '3' are used by the callers). */
+static void munit_test_runner_print_color(const MunitTestRunner *runner,
+                                          const char *string, char color) {
+  if (!runner->colorize) {
+    fputs(string, MUNIT_OUTPUT_FILE);
+    return;
+  }
+
+  fprintf(MUNIT_OUTPUT_FILE, "\x1b[3%cm%s\x1b[39m", color, string);
+}
+
+#if !defined(MUNIT_NO_BUFFER)
+/* Point STDERR_FILENO at the file behind `stderr_buf`.
+ *
+ * Returns a duplicate of the original stderr descriptor, to be passed
+ * to munit_restore_stderr(), or -1 when `stderr_buf` is NULL and nothing
+ * was changed.  Exits the process if `stderr_buf` has no descriptor. */
+static int munit_replace_stderr(FILE *stderr_buf) {
+  int saved_fd;
+  int buffer_fd;
+
+  if (stderr_buf == NULL)
+    return -1;
+
+  saved_fd = dup(STDERR_FILENO);
+
+  buffer_fd = fileno(stderr_buf);
+  if (MUNIT_UNLIKELY(buffer_fd == -1)) {
+    exit(EXIT_FAILURE);
+  }
+
+  dup2(buffer_fd, STDERR_FILENO);
+
+  return saved_fd;
+}
+
+/* Undo munit_replace_stderr(): make `orig_stderr` the process's stderr
+ * again and close the spare descriptor.  -1 means nothing was replaced. */
+static void munit_restore_stderr(int orig_stderr) {
+  if (orig_stderr == -1)
+    return;
+
+  dup2(orig_stderr, STDERR_FILENO);
+  close(orig_stderr);
+}
+#endif /* !defined(MUNIT_NO_BUFFER) */
+
+/* Run a test with the specified parameters.
+ *
+ * Prints the parameter list (when there is one), runs the test - in a
+ * forked child when runner->fork is set and fork support is compiled in,
+ * otherwise in-process - prints the bracketed result, updates the totals
+ * in runner->report, and replays the captured stderr when the test
+ * failed or errored, or when runner->show_stderr is set.
+ *
+ * Failures of the infrastructure itself (no stderr buffer, no pipe, fork
+ * failure, child dying early) are counted as errors of the test. */
+static void
+munit_test_runner_run_test_with_params(MunitTestRunner *runner,
+                                       const MunitTest *test,
+                                       const MunitParameter params[]) {
+  MunitResult result = MUNIT_OK;
+  /* Tally for this one test; folded into runner->report below. */
+  MunitReport report = {0, 0, 0, 0,
+#if defined(MUNIT_ENABLE_TIMING)
+                        0, 0
+#endif
+  };
+  unsigned int output_l;
+  munit_bool first;
+  const MunitParameter *param;
+  FILE *stderr_buf;
+#if !defined(MUNIT_NO_FORK)
+  int pipefd[2];
+  pid_t fork_pid;
+  ssize_t bytes_written = 0;
+  ssize_t write_res;
+  ssize_t bytes_read = 0;
+  ssize_t read_res;
+  int status = 0;
+  pid_t changed_pid;
+#endif
+
+  /* Print "name=value, ..." and pad to the width of the name column. */
+  if (params != NULL) {
+    output_l = 2;
+    fputs("  ", MUNIT_OUTPUT_FILE);
+    first = 1;
+    for (param = params; param != NULL && param->name != NULL; param++) {
+      if (!first) {
+        fputs(", ", MUNIT_OUTPUT_FILE);
+        output_l += 2;
+      } else {
+        first = 0;
+      }
+
+      output_l += (unsigned int)fprintf(MUNIT_OUTPUT_FILE, "%s=%s", param->name,
+                                        param->value);
+    }
+    while (output_l++ < MUNIT_TEST_NAME_LEN) {
+      fputc(' ', MUNIT_OUTPUT_FILE);
+    }
+  }
+
+  fflush(MUNIT_OUTPUT_FILE);
+
+  /* Temporary file that collects whatever the test writes to stderr. */
+  stderr_buf = NULL;
+#if !defined(_WIN32) || defined(__MINGW32__)
+  stderr_buf = tmpfile();
+#else
+  tmpfile_s(&stderr_buf);
+#endif
+  if (stderr_buf == NULL) {
+    munit_log_errno(MUNIT_LOG_ERROR, stderr,
+                    "unable to create buffer for stderr");
+    /* The result line and the runner totals are driven by `report`, so
+     * the error has to be counted there or it goes unnoticed. */
+    report.errored++;
+    result = MUNIT_ERROR;
+    goto print_result;
+  }
+
+#if !defined(MUNIT_NO_FORK)
+  if (runner->fork) {
+    pipefd[0] = -1;
+    pipefd[1] = -1;
+    if (pipe(pipefd) != 0) {
+      munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create pipe");
+      /* Counted for the same reason as the missing stderr buffer above. */
+      report.errored++;
+      result = MUNIT_ERROR;
+      goto print_result;
+    }
+
+    fork_pid = fork();
+    if (fork_pid == 0) {
+      /* Child: run the test, then send the report back over the pipe. */
+      int orig_stderr;
+
+      close(pipefd[0]);
+
+      orig_stderr = munit_replace_stderr(stderr_buf);
+      munit_test_runner_exec(runner, test, params, &report);
+
+      /* Note that we don't restore stderr.  This is so we can buffer
+       * things written to stderr later on (such as by
+       * asan/tsan/ubsan, valgrind, etc.) */
+      close(orig_stderr);
+
+      do {
+        write_res =
+          write(pipefd[1], ((munit_uint8_t *)(&report)) + bytes_written,
+                sizeof(report) - (size_t)bytes_written);
+        if (write_res < 0) {
+          if (stderr_buf != NULL) {
+            munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to write to pipe");
+          }
+          exit(EXIT_FAILURE);
+        }
+        bytes_written += write_res;
+      } while ((size_t)bytes_written < sizeof(report));
+
+      if (stderr_buf != NULL)
+        fclose(stderr_buf);
+      close(pipefd[1]);
+
+      exit(EXIT_SUCCESS);
+    } else if (fork_pid == -1) {
+      close(pipefd[0]);
+      close(pipefd[1]);
+      if (stderr_buf != NULL) {
+        munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to fork");
+      }
+      report.errored++;
+      result = MUNIT_ERROR;
+    } else {
+      /* Parent: read the child's report, then collect its exit status. */
+      close(pipefd[1]);
+      do {
+        read_res = read(pipefd[0], ((munit_uint8_t *)(&report)) + bytes_read,
+                        sizeof(report) - (size_t)bytes_read);
+        if (read_res < 1)
+          break;
+        bytes_read += read_res;
+      } while (bytes_read < (ssize_t)sizeof(report));
+
+      changed_pid = waitpid(fork_pid, &status, 0);
+
+      if (MUNIT_LIKELY(changed_pid == fork_pid) &&
+          MUNIT_LIKELY(WIFEXITED(status))) {
+        if (bytes_read != sizeof(report)) {
+          /* The child exited before delivering a complete report. */
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf,
+                              "child exited unexpectedly with status %d",
+                              WEXITSTATUS(status));
+          report.errored++;
+        } else if (WEXITSTATUS(status) != EXIT_SUCCESS) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf,
+                              "child exited with status %d",
+                              WEXITSTATUS(status));
+          report.errored++;
+        }
+      } else {
+        if (WIFSIGNALED(status)) {
+#  if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 700)
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf,
+                              "child killed by signal %d (%s)",
+                              WTERMSIG(status), strsignal(WTERMSIG(status)));
+#  else
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf,
+                              "child killed by signal %d", WTERMSIG(status));
+#  endif
+        } else if (WIFSTOPPED(status)) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf,
+                              "child stopped by signal %d", WSTOPSIG(status));
+        }
+        report.errored++;
+      }
+
+      close(pipefd[0]);
+      /* NOTE(review): the child was normally reaped by the waitpid()
+       * above, in which case this second call has nothing left to do. */
+      waitpid(fork_pid, NULL, 0);
+    }
+  } else
+#endif
+  {
+    /* In-process execution. */
+#if !defined(MUNIT_NO_BUFFER)
+    const volatile int orig_stderr = munit_replace_stderr(stderr_buf);
+#endif
+
+#if defined(MUNIT_THREAD_LOCAL)
+    /* A non-zero return from setjmp means control came back through
+     * munit_error_jmp_buf; that is recorded as a failure. */
+    if (MUNIT_UNLIKELY(setjmp(munit_error_jmp_buf) != 0)) {
+      result = MUNIT_FAIL;
+      report.failed++;
+    } else {
+      munit_error_jmp_buf_valid = 1;
+      result = munit_test_runner_exec(runner, test, params, &report);
+    }
+#else
+    result = munit_test_runner_exec(runner, test, params, &report);
+#endif
+
+#if !defined(MUNIT_NO_BUFFER)
+    munit_restore_stderr(orig_stderr);
+#endif
+
+    /* Here just so that the label is used on Windows and we don't get
+     * a warning */
+    goto print_result;
+  }
+
+print_result:
+
+  /* The result is derived from the per-test tally.  Priority: TODO
+   * handling, then failed, errored, skipped, and finally success. */
+  fputs("[ ", MUNIT_OUTPUT_FILE);
+  if ((test->options & MUNIT_TEST_OPTION_TODO) == MUNIT_TEST_OPTION_TODO) {
+    /* A TODO test is expected not to pass; passing is what gets flagged. */
+    if (report.failed != 0 || report.errored != 0 || report.skipped != 0) {
+      munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_TODO, '3');
+      result = MUNIT_OK;
+    } else {
+      munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1');
+      if (MUNIT_LIKELY(stderr_buf != NULL))
+        munit_log_internal(MUNIT_LOG_ERROR, stderr_buf,
+                           "Test marked TODO, but was successful.");
+      runner->report.failed++;
+      result = MUNIT_ERROR;
+    }
+  } else if (report.failed > 0) {
+    munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_FAIL, '1');
+    runner->report.failed++;
+    result = MUNIT_FAIL;
+  } else if (report.errored > 0) {
+    munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1');
+    runner->report.errored++;
+    result = MUNIT_ERROR;
+  } else if (report.skipped > 0) {
+    munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_SKIP, '3');
+    runner->report.skipped++;
+    result = MUNIT_SKIP;
+  } else if (report.successful > 1) {
+    /* Several iterations: print the per-iteration average, then totals. */
+    munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2');
+#if defined(MUNIT_ENABLE_TIMING)
+    fputs(" ] [ ", MUNIT_OUTPUT_FILE);
+    munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock / report.successful);
+    fputs(" / ", MUNIT_OUTPUT_FILE);
+    munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock / report.successful);
+    fprintf(MUNIT_OUTPUT_FILE,
+            " CPU ]\n  %-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s Total: [ ",
+            "");
+    munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock);
+    fputs(" / ", MUNIT_OUTPUT_FILE);
+    munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock);
+    fputs(" CPU", MUNIT_OUTPUT_FILE);
+#endif
+    runner->report.successful++;
+    result = MUNIT_OK;
+  } else if (report.successful > 0) {
+    munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2');
+#if defined(MUNIT_ENABLE_TIMING)
+    fputs(" ] [ ", MUNIT_OUTPUT_FILE);
+    munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock);
+    fputs(" / ", MUNIT_OUTPUT_FILE);
+    munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock);
+    fputs(" CPU", MUNIT_OUTPUT_FILE);
+#endif
+    runner->report.successful++;
+    result = MUNIT_OK;
+  }
+  fputs(" ]\n", MUNIT_OUTPUT_FILE);
+
+  if (stderr_buf != NULL) {
+    /* Replay the captured stderr after the result line. */
+    if (result == MUNIT_FAIL || result == MUNIT_ERROR || runner->show_stderr) {
+      fflush(MUNIT_OUTPUT_FILE);
+
+      rewind(stderr_buf);
+      munit_splice(fileno(stderr_buf), STDERR_FILENO);
+
+      fflush(stderr);
+    }
+
+    fclose(stderr_buf);
+  }
+}
+
+/* Run `test` once for every combination of the wildcard parameters.
+ *
+ * params: complete parameter array handed to the test.
+ * p:      wildcard entry within `params` handled at this recursion
+ *         level; the entries after it (up to the NULL-name terminator)
+ *         are the remaining wildcards.
+ *
+ * Each possible value is written into p->value in turn; the last
+ * wildcard runs the test, earlier ones recurse.  With fatal_failures set
+ * the iteration stops after the first failure or error. */
+static void munit_test_runner_run_test_wild(MunitTestRunner *runner,
+                                            const MunitTest *test,
+                                            const char *test_name,
+                                            MunitParameter *params,
+                                            MunitParameter *p) {
+  const MunitParameterEnum *pe;
+  char **values;
+  MunitParameter *next;
+
+  /* Find the declaration of this parameter.  Names are compared by
+   * pointer: the entry's name was taken from the declaration itself. */
+  for (pe = test->parameters; pe != NULL && pe->name != NULL; pe++) {
+    if (p->name == pe->name)
+      break;
+  }
+
+  /* The search ends on the terminator entry (name == NULL) when nothing
+   * matched, so that has to be checked as well as pe itself; a
+   * declaration without a value list has nothing to iterate either. */
+  if (pe == NULL || pe->name == NULL || pe->values == NULL)
+    return;
+
+  for (values = pe->values; *values != NULL; values++) {
+    next = p + 1;
+    p->value = *values;
+    if (next->name == NULL) {
+      munit_test_runner_run_test_with_params(runner, test, params);
+    } else {
+      munit_test_runner_run_test_wild(runner, test, test_name, params, next);
+    }
+    if (runner->fatal_failures &&
+        (runner->report.failed != 0 || runner->report.errored != 0))
+      break;
+  }
+}
+
+/* Run a single test, with every combination of parameters
+ * requested.
+ *
+ * prefix: accumulated suite prefix, prepended to the test's name for
+ *         display and for salting the PRNG.
+ *
+ * For each parameter the test declares, the value comes from the CLI if
+ * one was given; otherwise, when the declaration lists values, either
+ * one value is picked at random (--single) or the parameter becomes a
+ * wildcard that is iterated over all its values. */
+static void munit_test_runner_run_test(MunitTestRunner *runner,
+                                       const MunitTest *test,
+                                       const char *prefix) {
+  char *test_name =
+    munit_maybe_concat(NULL, (char *)prefix, (char *)test->name);
+  /* munit_maybe_concat yields NULL when there is nothing to concatenate;
+   * use an empty name for printing and hashing in that case. */
+  const char *display_name = test_name != NULL ? test_name : "";
+  /* The array of parameters to pass to
+   * munit_test_runner_run_test_with_params */
+  MunitParameter *params = NULL;
+  size_t params_l = 0;
+  /* Wildcard parameters are parameters which have possible values
+   * specified in the test, but no specific value was passed to the
+   * CLI.  That means we want to run the test once for every
+   * possible combination of parameter values or, if --single was
+   * passed to the CLI, a single time with a random set of
+   * parameters. */
+  MunitParameter *wild_params = NULL;
+  size_t wild_params_l = 0;
+  const MunitParameterEnum *pe;
+  const MunitParameter *cli_p;
+  munit_bool filled;
+  unsigned int possible;
+  char **vals;
+  size_t first_wild;
+  const MunitParameter *wp;
+  int pidx;
+
+  munit_rand_seed(runner->seed);
+
+  fprintf(MUNIT_OUTPUT_FILE, "%-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s",
+          display_name);
+
+  if (test->parameters == NULL) {
+    /* No parameters.  Simple, nice. */
+    munit_test_runner_run_test_with_params(runner, test, NULL);
+  } else {
+    fputc('\n', MUNIT_OUTPUT_FILE);
+
+    for (pe = test->parameters; pe != NULL && pe->name != NULL; pe++) {
+      /* Did we receive a value for this parameter from the CLI? */
+      filled = 0;
+      for (cli_p = runner->parameters; cli_p != NULL && cli_p->name != NULL;
+           cli_p++) {
+        if (strcmp(cli_p->name, pe->name) == 0) {
+          if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name,
+                                                  cli_p->value) != MUNIT_OK))
+            goto cleanup;
+          filled = 1;
+          break;
+        }
+      }
+      if (filled)
+        continue;
+
+      /* Nothing from CLI, is the enum NULL/empty?  We're not a
+       * fuzzer… */
+      if (pe->values == NULL || pe->values[0] == NULL)
+        continue;
+
+      /* If --single was passed to the CLI, choose a value from the
+       * list of possibilities randomly. */
+      if (runner->single_parameter_mode) {
+        possible = 0;
+        for (vals = pe->values; *vals != NULL; vals++)
+          possible++;
+        /* We want the tests to be reproducible, even if you're only
+         * running a single test, but we don't want every test with
+         * the same number of parameters to choose the same parameter
+         * number, so use the test name as a primitive salt. */
+        pidx =
+          (int)munit_rand_at_most(munit_str_hash(display_name), possible - 1);
+        if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name,
+                                                pe->values[pidx]) != MUNIT_OK))
+          goto cleanup;
+      } else {
+        /* We want to try every permutation.  Put in a placeholder
+         * entry, we'll iterate through them later. */
+        if (MUNIT_UNLIKELY(munit_parameters_add(&wild_params_l, &wild_params,
+                                                pe->name, NULL) != MUNIT_OK))
+          goto cleanup;
+      }
+    }
+
+    if (wild_params_l != 0) {
+      /* Append one entry per wildcard after the fixed parameters; the
+       * wild runner overwrites their values while iterating. */
+      first_wild = params_l;
+      for (wp = wild_params; wp != NULL && wp->name != NULL; wp++) {
+        /* A declaration without values (CLI-only parameter) must be
+         * skipped, not end the search: stopping there would leave any
+         * wildcard declared after it out of `params`. */
+        for (pe = test->parameters; pe != NULL && pe->name != NULL; pe++) {
+          if (pe->values != NULL && strcmp(wp->name, pe->name) == 0) {
+            if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params,
+                                                    pe->name,
+                                                    pe->values[0]) != MUNIT_OK))
+              goto cleanup;
+          }
+        }
+      }
+
+      munit_test_runner_run_test_wild(runner, test, display_name, params,
+                                      params + first_wild);
+    } else {
+      munit_test_runner_run_test_with_params(runner, test, params);
+    }
+
+  cleanup:
+    free(params);
+    free(wild_params);
+  }
+
+  munit_maybe_free_concat(test_name, prefix, test->name);
+}
+
+/* Recurse through the suite and run all the tests.  If a list of
+ * tests to run was provided on the command line, run only those
+ * tests.
+ *
+ * prefix: accumulated prefix of the enclosing suites (NULL at the root);
+ *         this suite's own prefix is appended to it.
+ *
+ * A requested name selects every test whose full name (prefix + test
+ * name) starts with it.  When runner->fatal_failures is set, nothing
+ * more is run once a failure or error has been recorded. */
+static void munit_test_runner_run_suite(MunitTestRunner *runner,
+                                        const MunitSuite *suite,
+                                        const char *prefix) {
+  size_t pre_l;
+  char *pre = munit_maybe_concat(&pre_l, (char *)prefix, (char *)suite->prefix);
+  const MunitTest *test;
+  const char **test_name;
+  const MunitSuite *child_suite;
+
+  /* Run the tests. */
+  for (test = suite->tests; test != NULL && test->test != NULL; test++) {
+    if (runner->tests != NULL) { /* Specific tests were requested on the CLI */
+      for (test_name = runner->tests; test_name != NULL && *test_name != NULL;
+           test_name++) {
+        if ((pre_l == 0 || strncmp(pre, *test_name, pre_l) == 0) &&
+            strncmp(test->name, *test_name + pre_l,
+                    strlen(*test_name + pre_l)) == 0) {
+          munit_test_runner_run_test(runner, test, pre);
+          if (runner->fatal_failures &&
+              (runner->report.failed != 0 || runner->report.errored != 0))
+            goto cleanup;
+        }
+      }
+    } else { /* Run all tests */
+      munit_test_runner_run_test(runner, test, pre);
+      /* Honour --fatal-failures here too, not only for requested tests. */
+      if (runner->fatal_failures &&
+          (runner->report.failed != 0 || runner->report.errored != 0))
+        goto cleanup;
+    }
+  }
+
+  /* Covers a failure recorded before this suite had any test to run. */
+  if (runner->fatal_failures &&
+      (runner->report.failed != 0 || runner->report.errored != 0))
+    goto cleanup;
+
+  /* Run any child suites. */
+  for (child_suite = suite->suites;
+       child_suite != NULL && child_suite->prefix != NULL; child_suite++) {
+    munit_test_runner_run_suite(runner, child_suite, pre);
+    /* Do not start the next sibling suite after a fatal failure. */
+    if (runner->fatal_failures &&
+        (runner->report.failed != 0 || runner->report.errored != 0))
+      goto cleanup;
+  }
+
+cleanup:
+
+  munit_maybe_free_concat(pre, prefix, suite->prefix);
+}
+
+/* Entry point of a run: execute the runner's root suite, starting with
+ * no name prefix. */
+static void munit_test_runner_run(MunitTestRunner *runner) {
+  munit_test_runner_run_suite(runner, runner->suite, NULL);
+}
+
+static void munit_print_help(int argc, char *const *argv, void *user_data,
+                             const MunitArgument arguments[]) {
+  const MunitArgument *arg;
+  (void)argc; /* unused; only argv[0] is needed for the usage line */
+  /* Built-in options first, then the version banner, then custom arguments. */
+  printf("USAGE: %s [OPTIONS...] [TEST...]\n\n", argv[0]);
+  puts(
+    " --seed SEED\n"
+    "           Value used to seed the PRNG.  Must be a 32-bit integer in "
+    "decimal\n"
+    "           notation with no separators (commas, decimals, spaces, "
+    "etc.), or\n"
+    "           hexidecimal prefixed by \"0x\".\n"
+    " --iterations N\n"
+    "           Run each test N times.  0 means the default number.\n"
+    " --param name value\n"
+    "           A parameter key/value pair which will be passed to any test "
+    "with\n"
+    "           takes a parameter of that name.  If not provided, the test "
+    "will be\n"
+    "           run once for each possible parameter value.\n"
+    " --list    Write a list of all available tests.\n"
+    " --list-params\n"
+    "           Write a list of all available tests and their possible "
+    "parameters.\n"
+    " --single  Run each parameterized test in a single configuration "
+    "instead of\n"
+    "           every possible combination\n"
+    " --log-visible debug|info|warning|error\n"
+    " --log-fatal debug|info|warning|error\n"
+    "           Set the level at which messages of different severities are "
+    "visible,\n"
+    "           or cause the test to terminate.\n"
+#if !defined(MUNIT_NO_FORK)
+    " --no-fork Do not execute tests in a child process.  If this option is "
+    "supplied\n"
+    "           and a test crashes (including by failing an assertion), no "
+    "further\n"
+    "           tests will be performed.\n"
+#endif
+    " --fatal-failures\n"
+    "           Stop executing tests as soon as a failure is found.\n"
+    " --show-stderr\n"
+    "           Show data written to stderr by the tests, even if the test "
+    "succeeds.\n"
+    " --color auto|always|never\n"
+    "           Colorize (or don't) the output.\n"
+    /* 12345678901234567890123456789012345678901234567890123456789012345678901234567890
+     */
+    " --help    Print this help message and exit.\n");
+#if defined(MUNIT_NL_LANGINFO)
+  setlocale(LC_ALL, "");
+  fputs((strcasecmp("UTF-8", nl_langinfo(CODESET)) == 0) ? "µnit" : "munit",
+        stdout);
+#else
+  fputs("munit", stdout); /* not puts(): the version continues this line */
+#endif
+  printf(" %d.%d.%d\n"
+         "Full documentation at: https://nemequ.github.io/munit/\n",
+         (MUNIT_CURRENT_VERSION >> 16) & 0xff,
+         (MUNIT_CURRENT_VERSION >> 8) & 0xff,
+         (MUNIT_CURRENT_VERSION >> 0) & 0xff);
+  for (arg = arguments; arg != NULL && arg->name != NULL; arg++)
+    arg->write_help(arg, user_data); /* each custom argument prints its own */
+}
+
+static const MunitArgument *
+munit_arguments_find(const MunitArgument arguments[], const char *name) {
+  const MunitArgument *cur = arguments;
+
+  /* The table is terminated by an entry whose name is NULL. */
+  for (; cur != NULL && cur->name != NULL; cur++)
+    if (strcmp(cur->name, name) == 0)
+      return cur;
+  return NULL; /* no table given, or no entry carries this name */
+}
+
+/* Print every test of suite and of its child suites, optionally with params. */
+static void munit_suite_list_tests(const MunitSuite *suite,
+                                   munit_bool show_params, const char *prefix) {
+  size_t full_len;
+  char *full_prefix =
+    munit_maybe_concat(&full_len, (char *)prefix, (char *)suite->prefix);
+  const MunitTest *t;
+  const MunitParameterEnum *pe;
+  char **value;
+  const MunitSuite *sub;
+
+  /* One line per test: the accumulated suite prefix followed by its name. */
+  for (t = suite->tests; t != NULL && t->name != NULL; t++) {
+    if (full_prefix != NULL)
+      fputs(full_prefix, stdout);
+    puts(t->name);
+
+    if (!show_params)
+      continue;
+
+    for (pe = t->parameters; pe != NULL && pe->name != NULL; pe++) {
+      fprintf(stdout, " - %s: ", pe->name);
+      if (pe->values == NULL) {
+        /* A parameter without a value list is reported as "Any". */
+        puts("Any");
+        continue;
+      }
+
+      /* Otherwise print its values as a comma-separated list. */
+      for (value = pe->values; *value != NULL; value++) {
+        if (value != pe->values)
+          fputs(", ", stdout);
+        fputs(*value, stdout);
+      }
+      putc('\n', stdout);
+    }
+  }
+
+  /* Descend into the child suites, handing down the prefix built so far. */
+  for (sub = suite->suites; sub != NULL && sub->prefix != NULL; sub++)
+    munit_suite_list_tests(sub, show_params, full_prefix);
+
+  munit_maybe_free_concat(full_prefix, prefix, suite->prefix);
+}
+
+static munit_bool munit_stream_supports_ansi(FILE *stream) {
+#if !defined(_WIN32)
+  return isatty(fileno(stream)); /* any terminal is treated as ANSI-capable */
+#else
+  /* On Windows a terminal only counts when the ANSICON variable is present. */
+#  if !defined(__MINGW32__)
+  size_t ansicon_size = 0;
+#  endif
+
+  if (isatty(fileno(stream))) {
+#  if !defined(__MINGW32__)
+    getenv_s(&ansicon_size, NULL, 0, "ANSICON"); /* size query, no buffer */
+    return ansicon_size != 0;
+#  else
+    return getenv("ANSICON") != NULL;
+#  endif
+  }
+  return 0;
+#endif
+}
+
+int munit_suite_main_custom(const MunitSuite *suite, void *user_data, int argc,
+                            char *const *argv,
+                            const MunitArgument arguments[]) {
+  int result = EXIT_FAILURE; /* returned as-is by every error path below */
+  MunitTestRunner runner;
+  size_t parameters_size = 0;
+  size_t tests_size = 0;
+  int arg;
+  MunitParameter *runner_parameters;
+  char *envptr;
+  unsigned long ts;
+  char *endptr;
+  unsigned long long iterations;
+  MunitLogLevel level;
+  const MunitArgument *argument;
+  const char **runner_tests;
+  unsigned int tests_run;
+  unsigned int tests_total;
+
+  runner.prefix = NULL;
+  runner.suite = NULL;
+  runner.tests = NULL;
+  runner.seed = 0;
+  runner.iterations = 0;
+  runner.parameters = NULL;
+  runner.single_parameter_mode = 0;
+  runner.user_data = NULL;
+
+  runner.report.successful = 0;
+  runner.report.skipped = 0;
+  runner.report.failed = 0;
+  runner.report.errored = 0;
+#if defined(MUNIT_ENABLE_TIMING)
+  runner.report.cpu_clock = 0;
+  runner.report.wall_clock = 0;
+#endif
+
+  runner.colorize = 0;
+#if !defined(_WIN32)
+  runner.fork = 1;
+#else
+  runner.fork = 0;
+#endif
+  runner.show_stderr = 0;
+  runner.fatal_failures = 0;
+  runner.suite = suite;
+  runner.user_data = user_data;
+  runner.seed = munit_rand_generate_seed();
+  runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE);
+  /* Options start with "--"; every other argument names a test to run. */
+  for (arg = 1; arg < argc; arg++) {
+    if (strncmp("--", argv[arg], 2) == 0) {
+      if (strcmp("seed", argv[arg] + 2) == 0) {
+        if (arg + 1 >= argc) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "%s requires an argument", argv[arg]);
+          goto cleanup;
+        }
+
+        envptr = argv[arg + 1];
+        ts = strtoul(argv[arg + 1], &envptr, 0);
+        if (*envptr != '\0' || ts > (~((munit_uint32_t)0U))) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "invalid value ('%s') passed to %s",
+                              argv[arg + 1], argv[arg]);
+          goto cleanup;
+        }
+        runner.seed = (munit_uint32_t)ts;
+
+        arg++;
+      } else if (strcmp("iterations", argv[arg] + 2) == 0) {
+        if (arg + 1 >= argc) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "%s requires an argument", argv[arg]);
+          goto cleanup;
+        }
+
+        endptr = argv[arg + 1];
+        iterations = strtoul(argv[arg + 1], &endptr, 0);
+        if (*endptr != '\0' || iterations > UINT_MAX) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "invalid value ('%s') passed to %s",
+                              argv[arg + 1], argv[arg]);
+          goto cleanup;
+        }
+        runner.iterations = (unsigned int)iterations;
+
+        arg++;
+      } else if (strcmp("param", argv[arg] + 2) == 0) {
+        if (arg + 2 >= argc) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "%s requires two arguments", argv[arg]);
+          goto cleanup;
+        }
+        /* Grow via a temporary so the old array is still freed on failure. */
+        runner_parameters = realloc(
+          runner.parameters, sizeof(MunitParameter) * (parameters_size + 2));
+        if (runner_parameters == NULL) {
+          munit_log_internal(MUNIT_LOG_ERROR, stderr,
+                             "failed to allocate memory");
+          goto cleanup;
+        }
+        runner.parameters = runner_parameters;
+        runner.parameters[parameters_size].name = (char *)argv[arg + 1];
+        runner.parameters[parameters_size].value = (char *)argv[arg + 2];
+        parameters_size++;
+        runner.parameters[parameters_size].name = NULL;
+        runner.parameters[parameters_size].value = NULL;
+        arg += 2;
+      } else if (strcmp("color", argv[arg] + 2) == 0) {
+        if (arg + 1 >= argc) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "%s requires an argument", argv[arg]);
+          goto cleanup;
+        }
+
+        if (strcmp(argv[arg + 1], "always") == 0)
+          runner.colorize = 1;
+        else if (strcmp(argv[arg + 1], "never") == 0)
+          runner.colorize = 0;
+        else if (strcmp(argv[arg + 1], "auto") == 0)
+          runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE);
+        else {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "invalid value ('%s') passed to %s",
+                              argv[arg + 1], argv[arg]);
+          goto cleanup;
+        }
+
+        arg++;
+      } else if (strcmp("help", argv[arg] + 2) == 0) {
+        munit_print_help(argc, argv, user_data, arguments);
+        result = EXIT_SUCCESS;
+        goto cleanup;
+      } else if (strcmp("single", argv[arg] + 2) == 0) {
+        runner.single_parameter_mode = 1;
+      } else if (strcmp("show-stderr", argv[arg] + 2) == 0) {
+        runner.show_stderr = 1;
+#if !defined(_WIN32)
+      } else if (strcmp("no-fork", argv[arg] + 2) == 0) {
+        runner.fork = 0;
+#endif
+      } else if (strcmp("fatal-failures", argv[arg] + 2) == 0) {
+        runner.fatal_failures = 1;
+      } else if (strcmp("log-visible", argv[arg] + 2) == 0 ||
+                 strcmp("log-fatal", argv[arg] + 2) == 0) {
+        if (arg + 1 >= argc) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "%s requires an argument", argv[arg]);
+          goto cleanup;
+        }
+
+        if (strcmp(argv[arg + 1], "debug") == 0)
+          level = MUNIT_LOG_DEBUG;
+        else if (strcmp(argv[arg + 1], "info") == 0)
+          level = MUNIT_LOG_INFO;
+        else if (strcmp(argv[arg + 1], "warning") == 0)
+          level = MUNIT_LOG_WARNING;
+        else if (strcmp(argv[arg + 1], "error") == 0)
+          level = MUNIT_LOG_ERROR;
+        else {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "invalid value ('%s') passed to %s",
+                              argv[arg + 1], argv[arg]);
+          goto cleanup;
+        }
+
+        if (strcmp("log-visible", argv[arg] + 2) == 0)
+          munit_log_level_visible = level;
+        else
+          munit_log_level_fatal = level;
+
+        arg++;
+      } else if (strcmp("list", argv[arg] + 2) == 0) {
+        munit_suite_list_tests(suite, 0, NULL);
+        result = EXIT_SUCCESS;
+        goto cleanup;
+      } else if (strcmp("list-params", argv[arg] + 2) == 0) {
+        munit_suite_list_tests(suite, 1, NULL);
+        result = EXIT_SUCCESS;
+        goto cleanup;
+      } else {
+        argument = munit_arguments_find(arguments, argv[arg] + 2);
+        if (argument == NULL) {
+          munit_logf_internal(MUNIT_LOG_ERROR, stderr,
+                              "unknown argument ('%s')", argv[arg]);
+          goto cleanup;
+        }
+
+        if (!argument->parse_argument(suite, user_data, &arg, argc, argv))
+          goto cleanup;
+      }
+    } else {
+      runner_tests =
+        realloc((void *)runner.tests, sizeof(char *) * (tests_size + 2));
+      if (runner_tests == NULL) {
+        munit_log_internal(MUNIT_LOG_ERROR, stderr,
+                           "failed to allocate memory");
+        goto cleanup;
+      }
+      runner.tests = runner_tests;
+      runner.tests[tests_size++] = argv[arg];
+      runner.tests[tests_size] = NULL;
+    }
+  }
+
+  fflush(stderr);
+  fprintf(MUNIT_OUTPUT_FILE,
+          "Running test suite with seed 0x%08" PRIx32 "...\n", runner.seed);
+
+  munit_test_runner_run(&runner);
+  /* Skipped tests count toward the total but not toward the tests run. */
+  tests_run =
+    runner.report.successful + runner.report.failed + runner.report.errored;
+  tests_total = tests_run + runner.report.skipped;
+  if (tests_run == 0) {
+    fprintf(stderr, "No tests run, %d (100%%) skipped.\n",
+            runner.report.skipped);
+  } else {
+    fprintf(MUNIT_OUTPUT_FILE,
+            "%d of %d (%0.0f%%) tests successful, %d (%0.0f%%) test skipped.\n",
+            runner.report.successful, tests_run,
+            (((double)runner.report.successful) / ((double)tests_run)) * 100.0,
+            runner.report.skipped,
+            (((double)runner.report.skipped) / ((double)tests_total)) * 100.0);
+  }
+  /* Only failed or errored tests turn the exit status into a failure. */
+  if (runner.report.failed == 0 && runner.report.errored == 0) {
+    result = EXIT_SUCCESS;
+  }
+
+cleanup:
+  free(runner.parameters);
+  free((void *)runner.tests);
+
+  return result;
+}
+
+int munit_suite_main(const MunitSuite *suite, void *user_data, int argc,
+                     char *const *argv) { /* same as _custom, no extra args */
+  return munit_suite_main_custom(suite, user_data, argc, argv, NULL);
+}
+
+static uint8_t hexchars[] = "0123456789abcdef";
+
+/* Write the low 32 bits of addr as 8 lowercase hex digits, most significant
+ * byte first; returns the position just past the last digit written. */
+static uint8_t *hexdump_addr(uint8_t *dest, size_t addr) {
+  unsigned int shift;
+
+  for (shift = 32; shift != 0;) {
+    shift -= 8;
+    *dest++ = hexchars[(addr >> (shift + 4)) & 0xf];
+    *dest++ = hexchars[(addr >> shift) & 0xf];
+  }
+
+  return dest;
+}
+
+/* Append data as |text|, substituting '.' for bytes outside 0x20..0x7e;
+ * returns the position just past the closing bar. */
+static uint8_t *asciidump(uint8_t *dest, const uint8_t *data, size_t datalen) {
+  size_t left;
+
+  *dest++ = '|';
+
+  for (left = datalen; left != 0; left--, data++) {
+    uint8_t ch = *data;
+
+    /* Bytes in the printable range are copied through unchanged. */
+    *dest++ = (ch >= 0x20 && ch <= 0x7e) ? ch : '.';
+  }
+
+  *dest++ = '|';
+  return dest;
+}
+
+/* Emit each byte as two hex digits and a blank, then fill up with blanks as
+ * if 8 bytes had been given; returns the position just past the output. */
+static uint8_t *hexdump8(uint8_t *dest, const uint8_t *data, size_t datalen) {
+  size_t col = 0;
+
+  while (col < datalen) {
+    uint8_t byte = data[col++];
+    *dest++ = hexchars[byte >> 4];
+    *dest++ = hexchars[byte & 0xf];
+    *dest++ = ' ';
+  }
+  /* Padding: three blanks for every byte missing from a group of 8. */
+  for (col *= 3; col < 8 * 3; col++)
+    *dest++ = ' ';
+
+  return dest;
+}
+
+/* Emit up to 16 bytes as two hexdump8 columns, each followed by a blank. */
+static uint8_t *hexdump16(uint8_t *dest, const uint8_t *data, size_t datalen) {
+  const uint8_t *second = NULL;
+  size_t first_len = datalen;
+  size_t second_len = 0;
+
+  if (datalen >= 8) {
+    first_len = 8;
+    second = data + 8;
+    second_len = datalen - 8;
+  }
+  dest = hexdump8(dest, data, first_len);
+  *dest++ = ' ';
+  dest = hexdump8(dest, second, second_len);
+  *dest++ = ' ';
+  return dest;
+}
+
+/* Format one dump row: address, two blanks, hex columns, ASCII column. */
+static uint8_t *hexdump_line(uint8_t *dest, const uint8_t *data, size_t datalen,
+                             size_t addr) {
+  uint8_t *out = hexdump_addr(dest, addr);
+
+  *out++ = ' ';
+  *out++ = ' ';
+  out = hexdump16(out, data, datalen);
+
+  /* Neither a newline nor a NUL is appended here. */
+  return asciidump(out, data, datalen);
+}
+
+/* Write a hex/ASCII dump of data to fp, 16 bytes per row, followed by a
+ * final line holding datalen.  Runs of identical full rows are collapsed
+ * to "*".  Returns 0 on success, or -1 as soon as fwrite comes up short. */
+int munit_hexdump(FILE *fp, const void *data, size_t datalen) {
+  const uint8_t *base = (const uint8_t *)data;
+  const uint8_t *row;
+  uint8_t line[128], *end;
+  size_t pos, row_len, out_len;
+  int collapsed = 0; /* non-zero once "*" was printed for the current run */
+
+  /* Nothing at all is written for empty input. */
+  if (datalen == 0) {
+    return 0;
+  }
+
+  for (pos = 0; pos < datalen; pos += 16) {
+    row = base + pos;
+    row_len = datalen - pos;
+    if (row_len > 16) {
+      row_len = 16;
+    }
+
+    /* A full row identical to the one before it is not printed again; the
+     * first such row in a run is replaced by a single "*" line. */
+    if (row_len == 16 && pos > 0) {
+      if (memcmp(row - 16, row, 16) != 0) {
+        collapsed = 0;
+      } else {
+        if (!collapsed) {
+          collapsed = 1;
+          if (fwrite("*\n", 1, 2, fp) < 2) {
+            return -1;
+          }
+        }
+        continue;
+      }
+    }
+
+    end = hexdump_line(line, row, row_len, pos);
+    *end++ = '\n';
+    out_len = (size_t)(end - line);
+    if (fwrite(line, 1, out_len, fp) < out_len) {
+      return -1;
+    }
+  }
+
+  /* Trailer: the total length, formatted like a row address. */
+  end = hexdump_addr(line, datalen);
+  *end++ = '\n';
+  out_len = (size_t)(end - line);
+
+  if (fwrite(line, 1, out_len, fp) < out_len) {
+    return -1;
+  }
+
+  return 0;
+}
+
+int munit_hexdump_diff(FILE *fp, const void *a, size_t alen, const void *b,
+                       size_t blen) {
+  size_t offset = 0, k, i, len, ncomp, maxlen, adoff = 0;
+  uint8_t buf[128], *p;
+  const uint8_t mk[2] = {'-', '+'};
+  struct datasource {
+    const uint8_t *data;
+    size_t datalen;
+    const uint8_t *s; /* current row, or NULL once past the end */
+    size_t n;         /* bytes in the current row, at most 16 */
+  } ds[] = {{a, alen, NULL, 0}, {b, blen, NULL, 0}}, *dp;
+
+  maxlen = alen < blen ? blen : alen;
+  /* Walk both buffers in 16-byte rows, up to the longer of the two. */
+  for (; offset < maxlen; offset += 16) {
+    for (k = 0; k < 2; ++k) {
+      dp = &ds[k];
+
+      if (offset < dp->datalen) {
+        dp->s = (const uint8_t *)dp->data + offset;
+        dp->n = dp->datalen - offset;
+
+        if (dp->n > 16) {
+          dp->n = 16;
+        }
+      } else {
+        dp->s = NULL;
+        dp->n = 0;
+      }
+    }
+    /* Rows that are byte-for-byte identical produce no output. */
+    if (ds[0].n == ds[1].n && memcmp(ds[0].s, ds[1].s, ds[0].n) == 0) {
+      continue;
+    }
+    /* Print each side's row, tagged "----" for a and "++++" for b. */
+    for (k = 0; k < 2; ++k) {
+      dp = &ds[k];
+
+      if (!dp->n) {
+        continue;
+      }
+
+      p = buf;
+      *p++ = mk[k];
+      *p++ = mk[k];
+      *p++ = mk[k];
+      *p++ = mk[k];
+
+      p = hexdump_line(p, dp->s, dp->n, offset);
+      *p++ = '\n';
+
+      len = (size_t)(p - buf);
+
+      if (fwrite(buf, 1, len, fp) < len) {
+        return -1;
+      }
+    }
+    /* The marker row needs both sides; skip it if one has run out. */
+    if (!ds[0].n || !ds[1].n) {
+      continue;
+    }
+
+    ncomp = ds[0].n < ds[1].n ? ds[0].n : ds[1].n;
+    /* Start from blanks and forget the ASCII offset of the previous row. */
+    p = buf + 4 + 10;
+    adoff = 0;
+    memset(buf, ' ', 4 + 78);
+
+    for (i = 0; i < ncomp; ++i) {
+      if (ds[0].s[i] == ds[1].s[i]) {
+        *p++ = ' ';
+        *p++ = ' ';
+      } else {
+        adoff = 4 + 10 + 51 + i; /* same byte's slot in the ASCII column */
+        *(buf + adoff) = '^';
+
+        *p++ = '^';
+        *p++ = '^';
+      }
+
+      *p++ = ' ';
+
+      if (i == 7) {
+        *p++ = ' ';
+      }
+    }
+
+    if (adoff) {
+      len = adoff + 1; /* end right after the last ASCII-column marker */
+    } else {
+      len = (size_t)(p - buf);
+    }
+
+    buf[len++] = '\n';
+
+    if (fwrite(buf, 1, len, fp) < len) {
+      return -1;
+    }
+  }
+  /* 0 on success; every short fwrite above returns -1 instead. */
+  return 0;
+}
diff --git a/tests/unit/munit.h b/tests/unit/munit.h
new file mode 100644
index 00000000000..b10d10ee0a5
--- /dev/null
+++ b/tests/unit/munit.h
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: MIT
+/* µnit Testing Framework
+ * Copyright (c) 2013-2017 Evan Nemerson <evan@nemerson.com>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MUNIT_H
+#define MUNIT_H
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stddef.h>
+
+#define MUNIT_VERSION(major, minor, revision)                                  \
+  (((major) << 16) | ((minor) << 8) | (revision))
+
+#define MUNIT_CURRENT_VERSION MUNIT_VERSION(0, 4, 1)
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+#  define munit_int8_t __int8
+#  define munit_uint8_t unsigned __int8
+#  define munit_int16_t __int16
+#  define munit_uint16_t unsigned __int16
+#  define munit_int32_t __int32
+#  define munit_uint32_t unsigned __int32
+#  define munit_int64_t __int64
+#  define munit_uint64_t unsigned __int64
+#else
+#  include <stdint.h>
+#  define munit_int8_t int8_t
+#  define munit_uint8_t uint8_t
+#  define munit_int16_t int16_t
+#  define munit_uint16_t uint16_t
+#  define munit_int32_t int32_t
+#  define munit_uint32_t uint32_t
+#  define munit_int64_t int64_t
+#  define munit_uint64_t uint64_t
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
+#  if !defined(PRIi8)
+#    define PRIi8 "i"
+#  endif
+#  if !defined(PRIi16)
+#    define PRIi16 "i"
+#  endif
+#  if !defined(PRIi32)
+#    define PRIi32 "i"
+#  endif
+#  if !defined(PRIi64)
+#    define PRIi64 "I64i"
+#  endif
+#  if !defined(PRId8)
+#    define PRId8 "d"
+#  endif
+#  if !defined(PRId16)
+#    define PRId16 "d"
+#  endif
+#  if !defined(PRId32)
+#    define PRId32 "d"
+#  endif
+#  if !defined(PRId64)
+#    define PRId64 "I64d"
+#  endif
+#  if !defined(PRIx8)
+#    define PRIx8 "x"
+#  endif
+#  if !defined(PRIx16)
+#    define PRIx16 "x"
+#  endif
+#  if !defined(PRIx32)
+#    define PRIx32 "x"
+#  endif
+#  if !defined(PRIx64)
+#    define PRIx64 "I64x"
+#  endif
+#  if !defined(PRIu8)
+#    define PRIu8 "u"
+#  endif
+#  if !defined(PRIu16)
+#    define PRIu16 "u"
+#  endif
+#  if !defined(PRIu32)
+#    define PRIu32 "u"
+#  endif
+#  if !defined(PRIu64)
+#    define PRIu64 "I64u"
+#  endif
+#else
+#  include <inttypes.h>
+#endif
+
+#if !defined(munit_bool)
+#  if defined(bool)
+#    define munit_bool bool
+#  elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+#    define munit_bool _Bool
+#  else
+#    define munit_bool int
+#  endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+#  define MUNIT_LIKELY(expr) (__builtin_expect((expr), 1))
+#  define MUNIT_UNLIKELY(expr) (__builtin_expect((expr), 0))
+#  define MUNIT_UNUSED __attribute__((__unused__))
+#else
+#  define MUNIT_LIKELY(expr) (expr)
+#  define MUNIT_UNLIKELY(expr) (expr)
+#  define MUNIT_UNUSED
+#endif
+
+#if !defined(_WIN32)
+#  define MUNIT_SIZE_MODIFIER "z"
+#  define MUNIT_CHAR_MODIFIER "hh"
+#  define MUNIT_SHORT_MODIFIER "h"
+#else
+#  if defined(_M_X64) || defined(__amd64__)
+#    define MUNIT_SIZE_MODIFIER "I64"
+#  else
+#    define MUNIT_SIZE_MODIFIER ""
+#  endif
+#  define MUNIT_CHAR_MODIFIER ""
+#  define MUNIT_SHORT_MODIFIER ""
+#endif
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#  define MUNIT_NO_RETURN _Noreturn
+#elif defined(__GNUC__)
+#  define MUNIT_NO_RETURN __attribute__((__noreturn__))
+#elif defined(_MSC_VER)
+#  define MUNIT_NO_RETURN __declspec(noreturn)
+#else
+#  define MUNIT_NO_RETURN
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1500)
+#  define MUNIT_PUSH_DISABLE_MSVC_C4127_                                       \
+    __pragma(warning(push)) __pragma(warning(disable : 4127))
+#  define MUNIT_POP_DISABLE_MSVC_C4127_ __pragma(warning(pop))
+#else
+#  define MUNIT_PUSH_DISABLE_MSVC_C4127_
+#  define MUNIT_POP_DISABLE_MSVC_C4127_
+#endif
+
+typedef enum {
+  MUNIT_LOG_DEBUG,
+  MUNIT_LOG_INFO,
+  MUNIT_LOG_WARNING,
+  MUNIT_LOG_ERROR
+} MunitLogLevel;
+
+#if defined(__GNUC__) && !defined(__MINGW32__)
+#  define MUNIT_PRINTF(string_index, first_to_check)                           \
+    __attribute__((format(printf, string_index, first_to_check)))
+#else
+#  define MUNIT_PRINTF(string_index, first_to_check)
+#endif
+
+MUNIT_PRINTF(4, 5)
+void munit_logf_ex(MunitLogLevel level, const char *filename, int line,
+                   const char *format, ...);
+
+#define munit_logf(level, format, ...)                                         \
+  munit_logf_ex(level, __FILE__, __LINE__, format, __VA_ARGS__)
+
+#define munit_log(level, msg) munit_logf(level, "%s", msg)
+
+MUNIT_NO_RETURN
+MUNIT_PRINTF(3, 4)
+void munit_errorf_ex(const char *filename, int line, const char *format, ...);
+
+#define munit_errorf(format, ...)                                              \
+  munit_errorf_ex(__FILE__, __LINE__, format, __VA_ARGS__)
+
+#define munit_error(msg) munit_errorf("%s", msg)
+
+#define munit_assert(expr)                                                     \
+  do {                                                                         \
+    if (!MUNIT_LIKELY(expr)) {                                                 \
+      munit_error("assertion failed: " #expr);                                 \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_true(expr)                                                \
+  do {                                                                         \
+    if (!MUNIT_LIKELY(expr)) {                                                 \
+      munit_error("assertion failed: " #expr " is not true");                  \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_false(expr)                                               \
+  do {                                                                         \
+    if (!MUNIT_LIKELY(!(expr))) {                                              \
+      munit_error("assertion failed: " #expr " is not false");                 \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_type_full(prefix, suffix, T, fmt, a, op, b)               \
+  do { /* a and b are each evaluated once, converted to T */                   \
+    T munit_tmp_a_ = (a);                                                      \
+    T munit_tmp_b_ = (b);                                                      \
+    if (!(munit_tmp_a_ op munit_tmp_b_)) {                                     \
+      munit_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix        \
+                   " %s " prefix "%" fmt suffix ")",                           \
+                   #a, #op, #b, munit_tmp_a_, #op, munit_tmp_b_);              \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_type(T, fmt, a, op, b)                                    \
+  munit_assert_type_full("", "", T, fmt, a, op, b)
+
+#define munit_assert_char(a, op, b)                                            \
+  munit_assert_type_full("'\\x", "'", char, "02" MUNIT_CHAR_MODIFIER "x", a,   \
+                         op, b)
+#define munit_assert_uchar(a, op, b)                                           \
+  munit_assert_type_full("'\\x", "'", unsigned char,                           \
+                         "02" MUNIT_CHAR_MODIFIER "x", a, op, b)
+#define munit_assert_short(a, op, b)                                           \
+  munit_assert_type(short, MUNIT_SHORT_MODIFIER "d", a, op, b)
+#define munit_assert_ushort(a, op, b)                                          \
+  munit_assert_type(unsigned short, MUNIT_SHORT_MODIFIER "u", a, op, b)
+#define munit_assert_int(a, op, b) munit_assert_type(int, "d", a, op, b)
+#define munit_assert_uint(a, op, b)                                            \
+  munit_assert_type(unsigned int, "u", a, op, b)
+#define munit_assert_long(a, op, b) munit_assert_type(long int, "ld", a, op, b)
+#define munit_assert_ulong(a, op, b)                                           \
+  munit_assert_type(unsigned long int, "lu", a, op, b)
+#define munit_assert_llong(a, op, b)                                           \
+  munit_assert_type(long long int, "lld", a, op, b)
+#define munit_assert_ullong(a, op, b)                                          \
+  munit_assert_type(unsigned long long int, "llu", a, op, b)
+
+#define munit_assert_size(a, op, b)                                            \
+  munit_assert_type(size_t, MUNIT_SIZE_MODIFIER "u", a, op, b)
+#define munit_assert_ssize(a, op, b)                                           \
+  munit_assert_type(ssize_t, MUNIT_SIZE_MODIFIER "d", a, op, b)
+
+#define munit_assert_float(a, op, b) munit_assert_type(float, "f", a, op, b)
+#define munit_assert_double(a, op, b) munit_assert_type(double, "g", a, op, b)
+#define munit_assert_ptr(a, op, b)                                             \
+  munit_assert_type(const void *, "p", a, op, b)
+
+#define munit_assert_int8(a, op, b)                                            \
+  munit_assert_type(munit_int8_t, PRIi8, a, op, b)
+#define munit_assert_uint8(a, op, b)                                           \
+  munit_assert_type(munit_uint8_t, PRIu8, a, op, b)
+#define munit_assert_int16(a, op, b)                                           \
+  munit_assert_type(munit_int16_t, PRIi16, a, op, b)
+#define munit_assert_uint16(a, op, b)                                          \
+  munit_assert_type(munit_uint16_t, PRIu16, a, op, b)
+#define munit_assert_int32(a, op, b)                                           \
+  munit_assert_type(munit_int32_t, PRIi32, a, op, b)
+#define munit_assert_uint32(a, op, b)                                          \
+  munit_assert_type(munit_uint32_t, PRIu32, a, op, b)
+#define munit_assert_int64(a, op, b)                                           \
+  munit_assert_type(munit_int64_t, PRIi64, a, op, b)
+#define munit_assert_uint64(a, op, b)                                          \
+  munit_assert_type(munit_uint64_t, PRIu64, a, op, b)
+
+#define munit_assert_ptrdiff(a, op, b)                                         \
+  munit_assert_type(ptrdiff_t, "td", a, op, b)
+
+#define munit_assert_enum(T, a, op, b) munit_assert_type(T, "d", a, op, b)
+
+#define munit_assert_double_equal(a, b, precision)                             \
+  do { /* fails when |a - b| > 1e-precision; precision is token-pasted */      \
+    const double munit_tmp_a_ = (a);                                           \
+    const double munit_tmp_b_ = (b);                                           \
+    const double munit_tmp_diff_ = ((munit_tmp_a_ - munit_tmp_b_) < 0)         \
+                                     ? -(munit_tmp_a_ - munit_tmp_b_)          \
+                                     : (munit_tmp_a_ - munit_tmp_b_);          \
+    if (MUNIT_UNLIKELY(munit_tmp_diff_ > 1e-##precision)) {                    \
+      munit_errorf("assertion failed: %s == %s (%0." #precision                \
+                   "g == %0." #precision "g)",                                 \
+                   #a, #b, munit_tmp_a_, munit_tmp_b_);                        \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#include <string.h>
+#define munit_assert_string_equal(a, b)                                        \
+  do { /* on mismatch, dump a byte diff to stderr before failing */            \
+    const char *munit_tmp_a_ = (a);                                            \
+    const char *munit_tmp_b_ = (b);                                            \
+    if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) != 0)) {             \
+      munit_hexdump_diff(stderr, munit_tmp_a_, strlen(munit_tmp_a_),           \
+                         munit_tmp_b_, strlen(munit_tmp_b_));                  \
+      munit_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")", #a, \
+                   #b, munit_tmp_a_, munit_tmp_b_);                            \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_string_not_equal(a, b)                                    \
+  do {                                                                         \
+    const char *munit_tmp_a_ = (a);                                            \
+    const char *munit_tmp_b_ = (b);                                            \
+    if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) == 0)) {             \
+      munit_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")", #a, \
+                   #b, munit_tmp_a_, munit_tmp_b_);                            \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_memory_equal(size, a, b)                                  \
+  do {                                                                         \
+    const unsigned char *munit_tmp_a_ = (const unsigned char *)(a);            \
+    const unsigned char *munit_tmp_b_ = (const unsigned char *)(b);            \
+    const size_t munit_tmp_size_ = (size);                                     \
+    if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) != \
+        0) {                                                                   \
+      size_t munit_tmp_pos_;                                                   \
+      for (munit_tmp_pos_ = 0; munit_tmp_pos_ < munit_tmp_size_;               \
+           munit_tmp_pos_++) {                                                 \
+        if (munit_tmp_a_[munit_tmp_pos_] != munit_tmp_b_[munit_tmp_pos_]) {    \
+          munit_hexdump_diff(stderr, munit_tmp_a_, munit_tmp_size_,            \
+                             munit_tmp_b_, munit_tmp_size_);                   \
+          munit_errorf("assertion failed: memory %s == %s, at offset "         \
+                       "%" MUNIT_SIZE_MODIFIER "u", #a, #b, munit_tmp_pos_);   \
+          break;                                                               \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_memn_equal(a, a_size, b, b_size)                          \
+  do {                                                                         \
+    const unsigned char *munit_tmp_a_ = (const unsigned char *)(a);            \
+    const unsigned char *munit_tmp_b_ = (const unsigned char *)(b);            \
+    const size_t munit_tmp_a_size_ = (a_size);                                 \
+    const size_t munit_tmp_b_size_ = (b_size);                                 \
+    if (MUNIT_UNLIKELY(munit_tmp_a_size_ != munit_tmp_b_size_) ||              \
+        MUNIT_UNLIKELY(munit_tmp_a_size_ && memcmp(munit_tmp_a_, munit_tmp_b_, \
+                                                   munit_tmp_a_size_)) != 0) { \
+      munit_hexdump_diff(stderr, munit_tmp_a_, munit_tmp_a_size_,              \
+                         munit_tmp_b_, munit_tmp_b_size_);                     \
+      munit_errorf("assertion failed: memory %s == %s", #a, #b);               \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_memory_not_equal(size, a, b)                              \
+  do {                                                                         \
+    const unsigned char *munit_tmp_a_ = (const unsigned char *)(a);            \
+    const unsigned char *munit_tmp_b_ = (const unsigned char *)(b);            \
+    const size_t munit_tmp_size_ = (size);                                     \
+    if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) == \
+        0) {                                                                   \
+      munit_errorf("assertion failed: memory %s != %s (%" MUNIT_SIZE_MODIFIER  \
+                   "u bytes)", #a, #b, munit_tmp_size_);                       \
+    }                                                                          \
+    MUNIT_PUSH_DISABLE_MSVC_C4127_                                             \
+  } while (0) MUNIT_POP_DISABLE_MSVC_C4127_
+
+#define munit_assert_ptr_equal(a, b) munit_assert_ptr(a, ==, b)
+#define munit_assert_ptr_not_equal(a, b) munit_assert_ptr(a, !=, b)
+#define munit_assert_null(ptr) munit_assert_ptr(ptr, ==, NULL)
+#define munit_assert_not_null(ptr) munit_assert_ptr(ptr, !=, NULL)
+#define munit_assert_ptr_null(ptr) munit_assert_ptr(ptr, ==, NULL)
+#define munit_assert_ptr_not_null(ptr) munit_assert_ptr(ptr, !=, NULL)
+
+/*** Memory allocation ***/
+
+void *munit_malloc_ex(const char *filename, int line, size_t size);
+
+#define munit_malloc(size) munit_malloc_ex(__FILE__, __LINE__, (size))
+
+#define munit_new(type) ((type *)munit_malloc(sizeof(type)))
+
+#define munit_calloc(nmemb, size) munit_malloc((nmemb) * (size))
+
+#define munit_newa(type, nmemb) ((type *)munit_calloc((nmemb), sizeof(type)))
+
+/*** Random number generation ***/
+
+void munit_rand_seed(munit_uint32_t seed);
+munit_uint32_t munit_rand_uint32(void);
+int munit_rand_int_range(int min, int max);
+double munit_rand_double(void);
+void munit_rand_memory(size_t size, munit_uint8_t *buffer);
+
+/*** Tests and Suites ***/
+
+typedef enum {
+  /* Test successful */
+  MUNIT_OK,
+  /* Test failed */
+  MUNIT_FAIL,
+  /* Test was skipped */
+  MUNIT_SKIP,
+  /* Test failed due to circumstances not intended to be tested
+   * (things like network errors, invalid parameter value, failure to
+   * allocate memory in the test harness, etc.). */
+  MUNIT_ERROR
+} MunitResult;
+
+typedef struct {
+  char *name;
+  char **values;
+} MunitParameterEnum;
+
+typedef struct {
+  char *name;
+  char *value;
+} MunitParameter;
+
+const char *munit_parameters_get(const MunitParameter params[],
+                                 const char *key);
+
+typedef enum {
+  MUNIT_TEST_OPTION_NONE = 0,
+  MUNIT_TEST_OPTION_SINGLE_ITERATION = 1 << 0,
+  MUNIT_TEST_OPTION_TODO = 1 << 1
+} MunitTestOptions;
+
+typedef MunitResult (*MunitTestFunc)(const MunitParameter params[],
+                                     void *user_data_or_fixture);
+typedef void *(*MunitTestSetup)(const MunitParameter params[], void *user_data);
+typedef void (*MunitTestTearDown)(void *fixture);
+
+typedef struct {
+  const char *name;
+  MunitTestFunc test;
+  MunitTestSetup setup;
+  MunitTestTearDown tear_down;
+  MunitTestOptions options;
+  MunitParameterEnum *parameters;
+} MunitTest;
+
+typedef enum { MUNIT_SUITE_OPTION_NONE = 0 } MunitSuiteOptions;
+
+typedef struct MunitSuite_ MunitSuite;
+
+struct MunitSuite_ {
+  const char *prefix;
+  const MunitTest *tests;
+  const MunitSuite *suites;
+  unsigned int iterations;
+  MunitSuiteOptions options;
+};
+
+int munit_suite_main(const MunitSuite *suite, void *user_data, int argc,
+                     char *const *argv);
+
+/* Note: I'm not very happy with this API; it's likely to change if I
+ * figure out something better.  Suggestions welcome. */
+
+typedef struct MunitArgument_ MunitArgument;
+
+struct MunitArgument_ {
+  char *name;
+  munit_bool (*parse_argument)(const MunitSuite *suite, void *user_data,
+                               int *arg, int argc, char *const *argv);
+  void (*write_help)(const MunitArgument *argument, void *user_data);
+};
+
+int munit_suite_main_custom(const MunitSuite *suite, void *user_data, int argc,
+                            char *const *argv, const MunitArgument arguments[]);
+
+#if defined(MUNIT_ENABLE_ASSERT_ALIASES)
+
+#  define assert_true(expr) munit_assert_true(expr)
+#  define assert_false(expr) munit_assert_false(expr)
+#  define assert_char(a, op, b) munit_assert_char(a, op, b)
+#  define assert_uchar(a, op, b) munit_assert_uchar(a, op, b)
+#  define assert_short(a, op, b) munit_assert_short(a, op, b)
+#  define assert_ushort(a, op, b) munit_assert_ushort(a, op, b)
+#  define assert_int(a, op, b) munit_assert_int(a, op, b)
+#  define assert_uint(a, op, b) munit_assert_uint(a, op, b)
+#  define assert_long(a, op, b) munit_assert_long(a, op, b)
+#  define assert_ulong(a, op, b) munit_assert_ulong(a, op, b)
+#  define assert_llong(a, op, b) munit_assert_llong(a, op, b)
+#  define assert_ullong(a, op, b) munit_assert_ullong(a, op, b)
+#  define assert_size(a, op, b) munit_assert_size(a, op, b)
+#  define assert_ssize(a, op, b) munit_assert_ssize(a, op, b)
+#  define assert_float(a, op, b) munit_assert_float(a, op, b)
+#  define assert_double(a, op, b) munit_assert_double(a, op, b)
+#  define assert_ptr(a, op, b) munit_assert_ptr(a, op, b)
+
+#  define assert_int8(a, op, b) munit_assert_int8(a, op, b)
+#  define assert_uint8(a, op, b) munit_assert_uint8(a, op, b)
+#  define assert_int16(a, op, b) munit_assert_int16(a, op, b)
+#  define assert_uint16(a, op, b) munit_assert_uint16(a, op, b)
+#  define assert_int32(a, op, b) munit_assert_int32(a, op, b)
+#  define assert_uint32(a, op, b) munit_assert_uint32(a, op, b)
+#  define assert_int64(a, op, b) munit_assert_int64(a, op, b)
+#  define assert_uint64(a, op, b) munit_assert_uint64(a, op, b)
+
+#  define assert_ptrdiff(a, op, b) munit_assert_ptrdiff(a, op, b)
+
+#  define assert_enum(T, a, op, b) munit_assert_enum(T, a, op, b)
+
+#  define assert_double_equal(a, b, precision)                                 \
+    munit_assert_double_equal(a, b, precision)
+#  define assert_string_equal(a, b) munit_assert_string_equal(a, b)
+#  define assert_string_not_equal(a, b) munit_assert_string_not_equal(a, b)
+#  define assert_memory_equal(size, a, b) munit_assert_memory_equal(size, a, b)
+#  define assert_memn_equal(a, a_size, b, b_size)                              \
+    munit_assert_memn_equal(a, a_size, b, b_size)
+#  define assert_memory_not_equal(size, a, b)                                  \
+    munit_assert_memory_not_equal(size, a, b)
+#  define assert_ptr_equal(a, b) munit_assert_ptr_equal(a, b)
+#  define assert_ptr_not_equal(a, b) munit_assert_ptr_not_equal(a, b)
+#  define assert_ptr_null(ptr) munit_assert_ptr_null(ptr)
+#  define assert_ptr_not_null(ptr) munit_assert_not_null(ptr)
+
+#  define assert_null(ptr) munit_assert_null(ptr)
+#  define assert_not_null(ptr) munit_assert_not_null(ptr)
+
+#endif /* defined(MUNIT_ENABLE_ASSERT_ALIASES) */
+
+#define munit_void_test_decl(func)                                             \
+  void func(void);                                                             \
+                                                                               \
+  static inline MunitResult wrap_##func(const MunitParameter params[],         \
+                                        void *fixture) {                       \
+    (void)params;                                                              \
+    (void)fixture;                                                             \
+                                                                               \
+    func();                                                                    \
+    return MUNIT_OK;                                                           \
+  }
+
+#define munit_void_test(func)                                                  \
+  {"/" #func, wrap_##func, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}
+
+#define munit_test_end() {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}
+
+int munit_hexdump(FILE *fp, const void *data, size_t datalen);
+
+int munit_hexdump_diff(FILE *fp, const void *a, size_t alen, const void *b,
+                       size_t blen);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !defined(MUNIT_H) */
+
+#if defined(MUNIT_ENABLE_ASSERT_ALIASES)
+#if defined(assert)
+#  undef assert
+#endif
+#define assert(expr) munit_assert(expr)
+#endif
diff --git a/tests/unit/unit.c b/tests/unit/unit.c
new file mode 100644
index 00000000000..81b2e93975f
--- /dev/null
+++ b/tests/unit/unit.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2026, TrueNAS.
+ */
+
+/* Core stubs, applicable to all test suites. */
+
+#include <stdio.h>
+#include <stdarg.h>
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_debug.h>
+
+#include "munit.h"
+
+/*
+ * SET_ERROR() expands to __set_error() in debug builds. It's an
+ * under-the-hood tracing aid in production; a no-op is fine.
+ */
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+	(void) file; (void) func; (void) line; (void) err;
+}
+
+/* Plumb logging and debug into munit for convenience. */
+
+/* dprintf() checks zfs_flags and calls __dprintf() in debug builds. */
+int zfs_dbgmsg_enable = 1;
+int zfs_flags = ZFS_DEBUG_DPRINTF;
+
+/* Log dprintf() to MUNIT_LOG_DEBUG. */
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+    int line, const char *fmt, ...)
+{
+	char buf[1024];
+
+	va_list ap;
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof (buf), fmt, ap);
+	va_end(ap);
+
+	munit_logf_ex(MUNIT_LOG_DEBUG, NULL, 0, "%s%s:%d [%s]: %s",
+	    dprint ? "dprintf: " : "", file, line, func, buf);
+}
+
+/* Log cmn_err() to MUNIT_LOG_INFO or WARNING, abort test on CE_PANIC. */
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+	if (ce == CE_IGNORE)
+		return;
+
+	char buf[1024];
+
+	va_list ap;
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof (buf), fmt, ap);
+	va_end(ap);
+
+	switch (ce) {
+	case CE_WARN:
+		munit_logf_ex(MUNIT_LOG_WARNING, NULL, 0, "%s", buf);
+		break;
+	case CE_PANIC:
+		munit_errorf_ex(NULL, 0, "PANIC: %s", buf);
+		break;
+	default:
+		munit_logf_ex(MUNIT_LOG_INFO, NULL, 0, "%s", buf);
+		break;
+	}
+}
diff --git a/tests/unit/unit.h b/tests/unit/unit.h
new file mode 100644
index 00000000000..6b655082092
--- /dev/null
+++ b/tests/unit/unit.h
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2026, TrueNAS.
+ */
+
+#ifndef UNIT_H
+#define	UNIT_H
+
+#include "munit.h"
+
+/* test/suite definition helpers */
+
+/* single element in a MunitTest array */
+#define	_UNIT_TEST(name, func, params, ...)				\
+	{ (name), (func), NULL, NULL, MUNIT_TEST_OPTION_NONE,	\
+	(MunitParameterEnum*)(params)  }
+#define	UNIT_TEST(name, func, ...)				\
+	_UNIT_TEST(name, func, ##__VA_ARGS__, NULL)
+
+/* single element in a MunitParameterEnum array */
+#define	UNIT_PARAM(name, ...)	\
+	{ (char *)(name), (char **)(const char *[]) { __VA_ARGS__, NULL } }
+
+/* shortcut for truthy tests */
+#define	unit_true(a)	munit_assert_true(a)
+#define	unit_false(a)	munit_assert_false(a)
+
+/* shortcut for zero test */
+#define	unit_zero(a)	munit_assert_uint64((a), ==, 0)
+
+/* shortcuts for integer comparisons */
+#define	_unit_op(a, op, b)	munit_assert_uint64((a), op, (b))
+
+#define	unit_eq(a, b)	_unit_op((a), ==, (b))
+#define	unit_ne(a, b)	_unit_op((a), !=, (b))
+#define	unit_le(a, b)	_unit_op((a), <=, (b))
+#define	unit_ge(a, b)	_unit_op((a), >=, (b))
+#define	unit_lt(a, b)	_unit_op((a), <,  (b))
+#define	unit_gt(a, b)	_unit_op((a), >,  (b))
+
+/* shortcuts for string comparisons */
+#define	unit_str_eq(a, b)	munit_assert_string_equal(a, b)
+#define	unit_str_ne(a, b)	munit_assert_string_not_equal(a, b)
+
+/* shortcuts for error-returning function calls */
+#define	unit_ok(a)	munit_assert_int((a), ==, 0)
+#define	unit_err(a, e)	munit_assert_int((a), ==, (e))
+
+#endif /* UNIT_H */

From a20ef9c4e7035b2686aed762c20f2c8772b12dae Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sat, 2 May 2026 17:37:16 +1000
Subject: [PATCH 067/129] unit: dnode/dbuf/dmu_tx mocks

Some simple initial mocks for key DMU structures. It's hard to say this
early how generalisable these are, however they are enough for the ZAP
unit tests (next commit).

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18564
---
 tests/unit/Makefile.am |   2 +
 tests/unit/mock_dmu.c  | 393 +++++++++++++++++++++++++++++++++++++++++
 tests/unit/mock_dmu.h  |  47 +++++
 3 files changed, 442 insertions(+)
 create mode 100644 tests/unit/mock_dmu.c
 create mode 100644 tests/unit/mock_dmu.h

diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am
index 1ba1258143e..25c1c0788cc 100644
--- a/tests/unit/Makefile.am
+++ b/tests/unit/Makefile.am
@@ -5,6 +5,8 @@ libunit_la_CFLAGS = $(AM_CFLAGS)
 
 noinst_LTLIBRARIES += libunit.la
 libunit_la_SOURCES = \
+	%D%/mock_dmu.c \
+	%D%/mock_dmu.h \
 	%D%/munit.c \
 	%D%/munit.h \
 	%D%/unit.c \
diff --git a/tests/unit/mock_dmu.c b/tests/unit/mock_dmu.c
new file mode 100644
index 00000000000..65c38c1fd9f
--- /dev/null
+++ b/tests/unit/mock_dmu.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2026, TrueNAS.
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zfeature.h>
+
+#include "mock_dmu.h"
+
+/*
+ * A mock dbuf. A real dmu_buf_t (first for casting) plus the attached user
+ * data pointer. Block data is stored in a separate allocation so that the
+ * struct address remains stable across block resizes.
+ */
+struct mock_dbuf {
+	dmu_buf_t		mdb_db;		/* must be first */
+	dmu_buf_user_t		*mdb_user;	/* set by dmu_buf_set_user() */
+	mock_dnode_t		*mdb_owner;	/* owning mock dnode */
+	void			*mdb_data;	/* same as mdb_db.db_data */
+};
+typedef struct mock_dbuf mock_dbuf_t;
+
+/*
+ * A mock dnode. A real dnode_t (must be first for casting) with dn_type and
+ * dn_object set, plus a flat array of mock_dbuf_t pointers indexed by blkid.
+ */
+struct mock_dnode {
+	dnode_t			mdn_dn;		/* must be first */
+	size_t			mdn_blksize;	/* size of each new block */
+	size_t			mdn_nblocks;	/* length of mdn_blocks[] */
+	mock_dbuf_t		**mdn_blocks;	/* entries NULL until held */
+};
+
+/*
+ * A mock transaction. We only allocate and zero it, nothing currently uses
+ * any of its internals.
+ */
+struct mock_dmu_tx {
+	dmu_tx_t		mtx_tx;
+};
+
+/* Mock dnode */
+
+static mock_dbuf_t *
+mock_dnode_block_alloc(mock_dnode_t *mdn, uint64_t blkid)
+{
+	mock_dbuf_t *mdb = kmem_zalloc(sizeof (mock_dbuf_t), KM_SLEEP);
+	mdb->mdb_data = kmem_zalloc(mdn->mdn_blksize, KM_SLEEP);
+
+	mdb->mdb_db.db_object = mdn->mdn_dn.dn_object;
+	mdb->mdb_db.db_offset = blkid * mdn->mdn_blksize;
+	mdb->mdb_db.db_size   = mdn->mdn_blksize;
+	mdb->mdb_db.db_data   = mdb->mdb_data;
+	mdb->mdb_owner = mdn;
+
+	return (mdb);
+}
+
+/* Grow the dbuf array if needed, then return (or create) the dbuf for blkid. */
+static mock_dbuf_t *
+mock_dnode_block_get(mock_dnode_t *mdn, uint64_t blkid)
+{
+	if (blkid >= mdn->mdn_nblocks) {
+		size_t new_n = blkid + 1;
+		mock_dbuf_t **new_blocks =
+		    kmem_zalloc(new_n * sizeof (mock_dbuf_t *), KM_SLEEP);
+		if (mdn->mdn_blocks != NULL) {
+			memcpy(new_blocks, mdn->mdn_blocks,
+			    mdn->mdn_nblocks * sizeof (mock_dbuf_t *));
+			kmem_free(mdn->mdn_blocks,
+			    mdn->mdn_nblocks * sizeof (mock_dbuf_t *));
+		}
+		mdn->mdn_blocks = new_blocks;
+		mdn->mdn_nblocks = new_n;
+	}
+
+	mock_dbuf_t *mdb = mdn->mdn_blocks[blkid];
+	if (mdb == NULL) {
+		mdb = mock_dnode_block_alloc(mdn, blkid);
+		mdn->mdn_blocks[blkid] = mdb;
+	}
+	return (mdb);
+}
+
+mock_dnode_t *
+mock_dnode_create(size_t blksize, dmu_object_type_t type)
+{
+	ASSERT(IS_P2ALIGNED(blksize, 512));
+
+	mock_dnode_t *mdn = kmem_zalloc(sizeof (mock_dnode_t), KM_SLEEP);
+	mdn->mdn_dn.dn_type = type;
+	mdn->mdn_dn.dn_object = 1;	/* arbitrary non-zero object number */
+	mdn->mdn_blksize = blksize;
+
+	return (mdn);
+}
+
+void
+mock_dnode_destroy(mock_dnode_t *mdn)
+{
+	for (size_t i = 0; i < mdn->mdn_nblocks; i++) {
+		mock_dbuf_t *mdb = mdn->mdn_blocks[i];
+		if (mdb == NULL)
+			continue;
+
+		/*
+		 * Call the sync evict callback if one is set, mimicking the
+		 * real DMU when a buffer's refcount drops to zero.
+		 */
+		if (mdb->mdb_user != NULL &&
+		    mdb->mdb_user->dbu_evict_func_sync != NULL)
+			mdb->mdb_user->dbu_evict_func_sync(mdb->mdb_user);
+
+		kmem_free(mdb->mdb_data, mdb->mdb_db.db_size);
+		kmem_free(mdb, sizeof (mock_dbuf_t));
+	}
+
+	kmem_free(mdn->mdn_blocks,
+	    mdn->mdn_nblocks * sizeof (mock_dbuf_t *));
+	kmem_free(mdn, sizeof (mock_dnode_t));
+}
+
+size_t
+mock_dnode_block_count(mock_dnode_t *mdn)
+{
+	return (mdn->mdn_nblocks);
+}
+
+const void *
+mock_dnode_block_data(mock_dnode_t *mdn, uint64_t blkid)
+{
+	if (blkid >= mdn->mdn_nblocks || mdn->mdn_blocks[blkid] == NULL)
+		return (NULL);	/* out of range, or a never-held hole */
+	return (mdn->mdn_blocks[blkid]->mdb_db.db_data);
+}
+
+/* Mock transaction */
+
+mock_dmu_tx_t *
+mock_tx_create(void)
+{
+	return (kmem_zalloc(sizeof (mock_dmu_tx_t), KM_SLEEP));
+}
+
+void
+mock_tx_destroy(mock_dmu_tx_t *tx)
+{
+	kmem_free(tx, sizeof (mock_dmu_tx_t));
+}
+
+/* DMU stubs, either no-op or light access to mock dnode internals. */
+
+int
+dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
+    dmu_buf_t **dbp, dmu_flags_t flags)
+{
+	(void) tag; (void) flags;
+
+	mock_dnode_t *mdn = (mock_dnode_t *)dn;
+	uint64_t blkid = offset / mdn->mdn_blksize;
+	mock_dbuf_t *mdb = mock_dnode_block_get(mdn, blkid);
+
+	*dbp = &mdb->mdb_db;
+	return (0);
+}
+
+void
+dmu_buf_rele(dmu_buf_t *db, const void *tag)
+{
+	(void) db; (void) tag;
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db)
+{
+	mock_dbuf_t *mdb = (mock_dbuf_t *)db;
+	return (mdb->mdb_user);
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *new_user)
+{
+	mock_dbuf_t *mdb = (mock_dbuf_t *)db;
+	if (mdb->mdb_user != NULL)
+		return (mdb->mdb_user);	/* existing user wins */
+	mdb->mdb_user = new_user;
+	return (NULL);			/* new_user wins */
+}
+
+void
+dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx)
+{
+	(void) db; (void) tx;
+}
+
+objset_t *
+dmu_buf_get_objset(dmu_buf_t *db)
+{
+	mock_dbuf_t *mdb = (mock_dbuf_t *)db;
+
+	/*
+	 * We return the mock_dnode_t pointer cast to objset_t so that
+	 * dmu_object_set_blocksize() below can recover the dnode without
+	 * needing a separate objset structure.
+	 */
+	return ((objset_t *)mdb->mdb_owner);
+}
+
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+    int ibs, dmu_tx_t *tx)
+{
+	(void) object; (void) ibs; (void) tx;
+
+	/* os is a mock_dnode_t (see dmu_buf_get_objset() above). */
+	mock_dnode_t *mdn = (mock_dnode_t *)os;
+
+	/*
+	 * Swap block 0's data for a zeroed buffer of the new size, copying
+	 * over what fits. The mock_dbuf_t address itself stays stable.
+	 */
+	mock_dbuf_t *mdb = mdn->mdn_blocks[0];
+	void *new_data = kmem_zalloc(size, KM_SLEEP);
+	memcpy(new_data, mdb->mdb_data,
+	    MIN(size, (size_t)mdb->mdb_db.db_size));
+	kmem_free(mdb->mdb_data, mdb->mdb_db.db_size);
+
+	mdb->mdb_data = new_data;
+	mdb->mdb_db.db_size = size;
+	mdb->mdb_db.db_data = new_data;
+	mdn->mdn_blksize = size;
+
+	return (0);
+}
+
+boolean_t
+dnode_add_ref(dnode_t *dn, const void *tag)
+{
+	(void) dn; (void) tag;
+	return (B_TRUE);
+}
+
+void
+dnode_rele(dnode_t *dn, const void *tag)
+{
+	(void) dn; (void) tag;
+}
+
+/*
+ * Misc other stubs. Not strictly DMU mocks, and might move elsewhere later,
+ * but for now this is all we need for our limited test set.
+ */
+
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+	(void) os;
+	return (NULL);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, dmu_tx_t *tx)
+{
+	(void) os; (void) object; (void) offset; (void) size; (void) tx;
+	return (0);
+}
+
+void
+dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
+    uint64_t len, zio_priority_t pri)
+{
+	(void) dn; (void) level; (void) offset; (void) len; (void) pri;
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+	(void) os;
+	return (NULL);
+}
+
+boolean_t
+dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f)
+{
+	(void) ds; (void) f;
+	return (B_FALSE);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	(void) ds; (void) tx;
+}
+
+boolean_t
+spa_feature_is_enabled(spa_t *spa, spa_feature_t f)
+{
+	(void) spa; (void) f;
+	return (B_FALSE);
+}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+	(void) spa;
+	return (SPA_OLD_MAXBLOCKSIZE);
+}
+
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+
+void
+byteswap_uint64_array(void *buf, size_t size)
+{
+	(void) buf; (void) size;
+}
+
+/*
+ * Various objset+object calls; returning error, as they need to use
+ * _by_dnode() variants to get the mock.
+ */
+int
+dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp)
+{
+	(void) os; (void) object; (void) tag; (void) dnp;
+	return (EIO);
+}
+
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	(void) os; (void) object; (void) tx;
+	return (EIO);
+}
+
+uint64_t
+dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot,
+    int blocksize, int indirect_blockshift, dmu_object_type_t bonustype,
+    int bonuslen, int dnodesize, dnode_t **allocated_dnode,
+    const void *tag, dmu_tx_t *tx)
+{
+	(void) os; (void) ot; (void) blocksize; (void) indirect_blockshift;
+	(void) bonustype; (void) bonuslen; (void) dnodesize;
+	(void) allocated_dnode; (void) tag; (void) tx;
+	return (EIO);	/* NOTE(review): this is an object id, not an errno */
+}
+
+int
+dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+    int dnodesize, dmu_tx_t *tx)
+{
+	(void) os; (void) object; (void) ot; (void) blocksize;
+	(void) bonus_type; (void) bonus_len; (void) dnodesize; (void) tx;
+	return (EIO);
+}
+
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+	(void) os; (void) object; (void) doi;
+	return (EIO);
+}
+
+int
+dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t len)
+{
+	(void) os; (void) object; (void) offset; (void) len;
+	return (EIO);
+}
diff --git a/tests/unit/mock_dmu.h b/tests/unit/mock_dmu.h
new file mode 100644
index 00000000000..a46454c779f
--- /dev/null
+++ b/tests/unit/mock_dmu.h
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2026, TrueNAS.
+ */
+
+#ifndef _MOCK_DMU_H
+#define	_MOCK_DMU_H
+
+/*
+ * In-memory mock of the core DMU types for unit testing.
+ *
+ * Provides mock_dnode_t carrying a flat array of fixed-size blocks.
+ */
+
+#include <sys/types.h>
+
+typedef struct mock_dnode mock_dnode_t;
+typedef struct mock_dmu_tx mock_dmu_tx_t;
+
+/* Create a mock dnode with the given block size and object type. */
+mock_dnode_t *mock_dnode_create(size_t blksize, dmu_object_type_t type);
+
+/* Free a mock dnode and all its blocks. */
+void mock_dnode_destroy(mock_dnode_t *mdn);
+
+/* Returns the current number of blocks underlying this dnode. */
+size_t mock_dnode_block_count(mock_dnode_t *mdn);
+
+/* Returns a pointer to the data under the given block id. */
+const void *mock_dnode_block_data(mock_dnode_t *mdn, uint64_t blkid);
+
+/* Create/destroy a mock transaction handle. */
+mock_dmu_tx_t *mock_tx_create(void);
+void mock_tx_destroy(mock_dmu_tx_t *tx);
+
+#endif /* _MOCK_DMU_H */

From 1d601eb83b1b849edba047feae5137f0adb93ee2 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Tue, 5 May 2026 12:44:11 +1000
Subject: [PATCH 068/129] unit/test_zap: a trivial ZAP unit test suite

This commit adds the bones of a unit test suite for the ZAP subsystem.
The actual tests themselves don't do much, just ZAP creation and
destruction and basic KV ops. At this point it's intended to be enough to
demonstrate what tests under this framework would look like.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18564
---
 tests/unit/.gitignore  |   2 +
 tests/unit/Makefile.am |  24 +++-
 tests/unit/test_zap.c  | 273 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 298 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/test_zap.c

diff --git a/tests/unit/.gitignore b/tests/unit/.gitignore
index 52315f0b5fa..12a60a65666 100644
--- a/tests/unit/.gitignore
+++ b/tests/unit/.gitignore
@@ -1,2 +1,4 @@
 /test_*.info
 /test_*_coverage
+
+/test_zap
diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am
index 25c1c0788cc..43b7ddb84ca 100644
--- a/tests/unit/Makefile.am
+++ b/tests/unit/Makefile.am
@@ -14,10 +14,32 @@ libunit_la_SOURCES = \
 
 
 # all test binaries
-UNIT_TESTS =
+UNIT_TESTS = \
+	%D%/test_zap
 noinst_PROGRAMS = $(UNIT_TESTS)
 
 
+%C%_test_zap_CFLAGS = $(AM_CFLAGS)
+
+nodist_%C%_test_zap_SOURCES = \
+	module/zfs/zap.c \
+	module/zfs/zap_fat.c \
+	module/zfs/zap_impl.c \
+	module/zfs/zap_micro.c \
+	module/zfs/zap_leaf.c \
+	module/zfs/u8_textprep.c
+
+%C%_test_zap_SOURCES = \
+	%D%/test_zap.c
+
+%C%_test_zap_LDADD = \
+	libspl.la \
+	libbtree.la \
+	libunit.la
+
+%C%_test_zap_LDFLAGS = -pthread
+
+
 # test run and coverage targets below
 PHONY += unit unit-coverage
 
diff --git a/tests/unit/test_zap.c b/tests/unit/test_zap.c
new file mode 100644
index 00000000000..d8ec4288ec1
--- /dev/null
+++ b/tests/unit/test_zap.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2026, TrueNAS.
+ */
+
+#include <stdbool.h>
+
+#include <sys/zap.h>
+#include <sys/btree.h>
+typedef struct spa spa_t;	/* forward decl for zap_impl.h */
+#include <sys/zap_impl.h>
+
+#include "mock_dmu.h"
+#include "unit.h"
+
+/* ========== */
+
+/*
+ * Normally defined and initialised in arc.c.  We define and initialise it
+ * ourselves here so this mock can be linked without arc.c.
+ */
+uint64_t zfs_crc64_table[256];
+
+static void
+mock_crc64_init(void)
+{
+	for (int i = 0; i < 256; i++) {
+		uint64_t ct = i;
+		for (int j = 8; j > 0; j--)
+			ct = (ct >> 1) ^ (-(ct & 1) & ZFS_CRC64_POLY);
+		zfs_crc64_table[i] = ct;
+	}
+}
+
+/* Misc utility functions. */
+
+#define	rd64(ptr, off)	(*(const uint64_t *)((const char *)(ptr) + (off)))
+
+/* ========== */
+
+/* ZAP-specific mocks and other test helpers. */
+
+/* Create a microzap backed by a mock dnode. */
+static dnode_t *
+mock_zap_create_microzap(void)
+{
+	/*
+	 * We use DMU_OTN_ZAP_DATA so that DMU_OT_BYTESWAP() returns
+	 * DMU_BSWAP_ZAP without consulting dmu_ot[], which is not currently
+	 * provided in the mock.
+	 */
+	dnode_t *dn = (dnode_t *)mock_dnode_create(512, DMU_OTN_ZAP_DATA);
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+	mzap_create_impl(dn, 0, 0, tx);
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	return (dn);
+}
+
+/* Create a fatzap backed by a mock dnode. */
+static dnode_t *
+mock_zap_create_fatzap(void)
+{
+	/*
+	 * We can only create microzaps directly. They only take u64s as a
+	 * value, so we add a u16 to trigger an upgrade to fatzap.
+	 */
+	dnode_t *dn = mock_zap_create_microzap();
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+	uint16_t u16 = 0;
+	unit_ok(zap_add_by_dnode(dn, "_upgrade", sizeof (u16), 1, &u16, tx));
+	unit_ok(zap_remove_by_dnode(dn, "_upgrade", tx));
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	return (dn);
+}
+
+static bool
+mock_zap_is_microzap(dnode_t *dn)
+{
+	/* check block 0 has a microzap header */
+	const void *blk = mock_dnode_block_data((mock_dnode_t *)dn, 0);
+	return (rd64(blk, 0) == ZBT_MICRO);
+}
+
+static bool
+mock_zap_is_fatzap(dnode_t *dn)
+{
+	/* check block 0 has a fatzap header */
+	const void *blk = mock_dnode_block_data((mock_dnode_t *)dn, 0);
+	return (rd64(blk, 0) == ZBT_HEADER && rd64(blk, 8) == ZAP_MAGIC);
+}
+
+static void
+mock_zap_destroy(dnode_t *dn)
+{
+	mock_dnode_destroy((mock_dnode_t *)dn);
+}
+
+/* Create a ZAP of the type named in the given test params. */
+static dnode_t *
+mock_zap_create_params(const MunitParameter params[], const char *key) {
+	const char *type = munit_parameters_get(params, key);
+	if (type == NULL)
+		munit_error("mock_zap_create_params: missing type param");
+	else if (strcmp(type, "micro") == 0)
+		return (mock_zap_create_microzap());
+	else if (strcmp(type, "fat") == 0)
+		return (mock_zap_create_fatzap());
+	else
+		munit_errorf("mock_zap_create_params: invalid type '%s'", type);
+	__builtin_unreachable();
+}
+
+/*
+ * Confirm the stored ZAP is of the type named in the given test params. This
+ * is useful for sanity checks within tests that a ZAP wasn't unexpectedly
+ * upgraded during the test.
+ */
+static bool
+mock_zap_is_params(dnode_t *dn, const MunitParameter params[],
+    const char *key)
+{
+	const char *type = munit_parameters_get(params, key);
+	if (type == NULL)
+		munit_error("mock_zap_is_params: missing type param");
+	else if (strcmp(type, "micro") == 0)
+		return (mock_zap_is_microzap(dn));
+	else if (strcmp(type, "fat") == 0)
+		return (mock_zap_is_fatzap(dn));
+	else
+		munit_errorf("mock_zap_is_params: invalid type '%s'", type);
+	__builtin_unreachable();
+}
+
+/* ========== */
+
+/*
+ * Sanity checks for mock ZAPs. Ensures that the mock_zap_create_* functions
+ * really do create the right kind of ZAPs, since many of the tests need to
+ * run against both kinds to confirm that they all work the same way.
+ */
+static MunitResult
+test_mock_microzap_sanity(const MunitParameter params[], void *data)
+{
+	(void) params, (void) data;
+
+	dnode_t *dn = mock_zap_create_microzap();
+	unit_true(mock_zap_is_microzap(dn));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+static MunitResult
+test_mock_fatzap_sanity(const MunitParameter params[], void *data)
+{
+	(void) params, (void) data;
+
+	dnode_t *dn = mock_zap_create_fatzap();
+	unit_true(mock_zap_is_fatzap(dn));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* ========== */
+
+/*
+ * A simple add, lookup and remove test. Confirms basic operation. These are
+ * tested together simply because all other tests rely on these primitives.
+ */
+static MunitResult
+test_zap_basic(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* Insert a few entries. */
+	uint64_t val42 = 42;
+	uint64_t val99 = 99;
+	uint64_t val0  = 0;
+
+	unit_ok(zap_add_by_dnode(dn, "hello",
+	    sizeof (uint64_t), 1, &val42, tx));
+	unit_ok(zap_add_by_dnode(dn, "world",
+	    sizeof (uint64_t), 1, &val99, tx));
+	unit_ok(zap_add_by_dnode(dn, "zero",
+	    sizeof (uint64_t), 1, &val0, tx));
+
+	/* Lookup each entry. */
+	uint64_t result = 0;
+	unit_ok(zap_lookup_by_dnode(dn, "hello",
+	    sizeof (uint64_t), 1, &result));
+	unit_eq(result, 42);
+
+	unit_ok(zap_lookup_by_dnode(dn, "world",
+	    sizeof (uint64_t), 1, &result));
+	unit_eq(result, 99);
+
+	unit_ok(zap_lookup_by_dnode(dn, "zero",
+	    sizeof (uint64_t), 1, &result));
+	unit_eq(result, 0);
+
+	/* Non-existent key should return ENOENT. */
+	unit_err(zap_lookup_by_dnode(dn, "nope",
+	    sizeof (uint64_t), 1, &result), ENOENT);
+
+	/* Removing an entry should make it impossible to look up. */
+	unit_ok(zap_remove_by_dnode(dn, "world", tx));
+	unit_err(zap_lookup_by_dnode(dn, "world",
+	    sizeof (uint64_t), 1, &result), ENOENT);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* ========== */
+
+/* Test suite definition and boilerplate. */
+
+#define	UNIT_PARAM_ZAP_TYPES(p)	\
+	UNIT_PARAM((p), "micro", "fat")
+
+static const MunitParameterEnum zap_type_params[] = {
+	UNIT_PARAM_ZAP_TYPES("type"),
+	{ 0 },
+};
+
+static const MunitTest zap_tests[] = {
+	UNIT_TEST("mock_microzap_sanity",	test_mock_microzap_sanity),
+	UNIT_TEST("mock_fatzap_sanity",		test_mock_fatzap_sanity),
+
+	UNIT_TEST("zap_basic",	test_zap_basic,	zap_type_params),
+
+	{ 0 },
+};
+
+static const MunitSuite zap_test_suite = {
+	"zap.",
+	zap_tests,
+	NULL,
+	1,
+	MUNIT_SUITE_OPTION_NONE,
+};
+
+int
+main(int argc, char **argv)
+{
+	mock_crc64_init();
+
+	zap_init();
+
+	int rc = munit_suite_main(&zap_test_suite, NULL, argc, argv);
+
+	zap_fini();
+
+	return (rc);
+}

From 8f6f4bcb544ca650fd3796f059dd209dbe0bafa2 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 22 May 2026 13:58:36 -0700
Subject: [PATCH 069/129] ZTS: update sanity.run file

Several of the tests included in the sanity.run file are no
longer quick.  In fact, the pyzfs tests can take over 5 minutes
to run which exceeds the allowed default timeout resulting in
the testing being killed.

Perform a little housekeeping and drop any test which takes more
than 10 seconds to run.  This brings things back a little closer
to the original intent of having a battery of useful test cases
which can be run in ~10 minutes.

ZFS-CI-Type: quick
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18576
---
 tests/runfiles/sanity.run | 33 ++++++---------------------------
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run
index 0deaa038a31..788c9b39531 100644
--- a/tests/runfiles/sanity.run
+++ b/tests/runfiles/sanity.run
@@ -357,8 +357,7 @@ tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
 tags = ['functional', 'cli_root', 'zpool_set']
 
 [tests/functional/cli_root/zpool_split]
-tests = ['zpool_split_cliargs', 'zpool_split_devices',
-    'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_indirect']
+tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_indirect']
 tags = ['functional', 'cli_root', 'zpool_split']
 
 [tests/functional/cli_root/zpool_status]
@@ -439,12 +438,6 @@ tags = ['functional', 'features', 'large_dnode']
 tests = ['gang_blocks_001_pos']
 tags = ['functional', 'gang_blocks']
 
-[tests/functional/grow]
-pre =
-post =
-tests = ['grow_pool_001_pos', 'grow_replicas_001_pos']
-tags = ['functional', 'grow']
-
 [tests/functional/history]
 tests = ['history_004_pos', 'history_005_neg', 'history_007_pos',
     'history_009_pos']
@@ -502,12 +495,6 @@ tags = ['functional', 'nestedfs']
 tests = ['nopwrite_sync', 'nopwrite_volume']
 tags = ['functional', 'nopwrite']
 
-[tests/functional/pool_checkpoint]
-tests = ['checkpoint_conf_change', 'checkpoint_discard_many',
-    'checkpoint_removal', 'checkpoint_sm_scale', 'checkpoint_twice']
-tags = ['functional', 'pool_checkpoint']
-timeout = 1800
-
 [tests/functional/poolversion]
 tests = ['poolversion_001_pos', 'poolversion_002_pos']
 tags = ['functional', 'poolversion']
@@ -557,13 +544,11 @@ tags = ['functional', 'reservation']
 
 [tests/functional/rsend]
 tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos',
-    'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos',
-    'rsend_006_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos',
-    'rsend_014_pos', 'rsend_016_neg', 'rsend-exclude_001_pos',
-    'rsend-exclude_002_pos', 'send-c_verify_contents',
-    'send-c_volume', 'send-c_zstreamdump', 'send-c_recv_dedup',
-    'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props',
-    'send_encrypted_freeobjects',
+    'rsend_002_pos', 'rsend_003_pos', 'rsend_009_pos', 'rsend_010_pos',
+    'rsend_011_pos', 'rsend_016_neg', 'rsend-exclude_001_pos',
+    'rsend-exclude_002_pos', 'send-c_volume', 'send-c_zstreamdump',
+    'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy',
+    'send_encrypted_props', 'send_encrypted_freeobjects',
     'send_encrypted_truncated_files', 'send_freeobjects', 'send_holds',
     'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset',
     'send_invalid']
@@ -644,9 +629,3 @@ tags = ['functional', 'zvol', 'zvol_swap']
 [tests/functional/zpool_influxdb]
 tests = ['zpool_influxdb']
 tags = ['functional', 'zpool_influxdb']
-
-[tests/functional/pyzfs]
-tests = ['pyzfs_unittest']
-pre =
-post =
-tags = ['functional', 'pyzfs']

From 112b0131b9896fa62a1022e93db8e0ff1cdd79f9 Mon Sep 17 00:00:00 2001
From: Andrew Walker <andrew.walker@truenas.com>
Date: Mon, 25 May 2026 18:02:08 -0500
Subject: [PATCH 070/129] zpl_xattr: stop heap-allocating prefixed xattr names

The six __zpl_xattr_{user,trusted,security}_{get,set} entry points
built their prefixed name via kmem_asprintf("%s%s", prefix, name)
and freed it with kmem_strfree on the way out.

The Linux xattr API caps the full prefix+name length at
XATTR_NAME_MAX (255), the same bound fs/xattr.c's syscall handlers
rely on with their stack-resident struct xattr_name, and so do
the same in our xattr handlers.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@truenas.com>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Signed-off-by: Andrew Walker <andrew.walker@truenas.com>
Closes #18570
---
 module/os/linux/zfs/zpl_xattr.c | 92 ++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 36 deletions(-)

diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c
index d93282db815..68050c870de 100644
--- a/module/os/linux/zfs/zpl_xattr.c
+++ b/module/os/linux/zfs/zpl_xattr.c
@@ -701,6 +701,24 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value,
  * ZFS allows extended user attributes to be disabled administratively
  * by setting the 'xattr=off' property on the dataset.
  */
+
+/*
+ * Concatenate prefix + name into a NUL-terminated stack buffer.
+ * Linux fs/xattr.c (import_xattr_name) caps the full xattr name at
+ * XATTR_NAME_MAX before any handler runs, so XATTR_NAME_MAX + 1
+ * bytes always fit.
+ */
+static inline void
+zpl_xattr_join_name(char *buf, size_t buflen, const char *prefix,
+    size_t prefix_len, const char *name, size_t name_len)
+{
+	ASSERT3U(prefix_len + name_len + 1, <=, buflen);
+
+	memcpy(buf, prefix, prefix_len);
+	memcpy(buf + prefix_len, name, name_len);
+	buf[prefix_len + name_len] = '\0';
+}
+
 static int
 __zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size,
     const char *name, size_t name_len)
@@ -726,9 +744,13 @@ __zpl_xattr_user_get(struct inode *ip, const char *name,
 	 * try again without the namespace prefix for compatibility with
 	 * other platforms.
 	 */
-	char *xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+	char xattr_name[XATTR_NAME_MAX + 1];
+
+	zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
+	    XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN,
+	    name, strlen(name));
+
 	error = zpl_xattr_get(ip, xattr_name, value, size);
-	kmem_strfree(xattr_name);
 	if (error == -ENODATA)
 		error = zpl_xattr_get(ip, name, value, size);
 
@@ -758,8 +780,13 @@ __zpl_xattr_user_set(zidmap_t *user_ns,
 	 *   XATTR_CREATE: fail if xattr already exists
 	 *   XATTR_REPLACE: fail if xattr does not exist
 	 */
-	char *prefixed_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+	char prefixed_name[XATTR_NAME_MAX + 1];
 	const char *clear_name, *set_name;
+
+	zpl_xattr_join_name(prefixed_name, sizeof (prefixed_name),
+	    XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN,
+	    name, strlen(name));
+
 	if (zfs_xattr_compat) {
 		clear_name = prefixed_name;
 		set_name = name;
@@ -776,7 +803,7 @@ __zpl_xattr_user_set(zidmap_t *user_ns,
 	 * because it already exists.  Stop here.
 	 */
 	if (error == -EEXIST)
-		goto out;
+		return (error);
 	/*
 	 * If XATTR_REPLACE was specified and we succeeded to clear
 	 * an xattr, we don't need to replace anything when setting
@@ -788,10 +815,7 @@ __zpl_xattr_user_set(zidmap_t *user_ns,
 	/*
 	 * Set the new value with the configured name format.
 	 */
-	error = zpl_xattr_set(ip, set_name, value, size, flags);
-out:
-	kmem_strfree(prefixed_name);
-	return (error);
+	return (zpl_xattr_set(ip, set_name, value, size, flags));
 }
 ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set);
 
@@ -824,17 +848,16 @@ static int
 __zpl_xattr_trusted_get(struct inode *ip, const char *name,
     void *value, size_t size)
 {
-	char *xattr_name;
-	int error;
+	char xattr_name[XATTR_NAME_MAX + 1];
 
 	if (!capable(CAP_SYS_ADMIN))
 		return (-EACCES);
-	/* xattr_resolve_name will do this for us if this is defined */
-	xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
-	error = zpl_xattr_get(ip, xattr_name, value, size);
-	kmem_strfree(xattr_name);
 
-	return (error);
+	zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
+	    XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN,
+	    name, strlen(name));
+
+	return (zpl_xattr_get(ip, xattr_name, value, size));
 }
 ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get);
 
@@ -844,17 +867,16 @@ __zpl_xattr_trusted_set(zidmap_t *user_ns,
     const void *value, size_t size, int flags)
 {
 	(void) user_ns;
-	char *xattr_name;
-	int error;
+	char xattr_name[XATTR_NAME_MAX + 1];
 
 	if (!capable(CAP_SYS_ADMIN))
 		return (-EACCES);
-	/* xattr_resolve_name will do this for us if this is defined */
-	xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
-	error = zpl_xattr_set(ip, xattr_name, value, size, flags);
-	kmem_strfree(xattr_name);
 
-	return (error);
+	zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
+	    XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN,
+	    name, strlen(name));
+
+	return (zpl_xattr_set(ip, xattr_name, value, size, flags));
 }
 ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set);
 
@@ -889,14 +911,13 @@ static int
 __zpl_xattr_security_get(struct inode *ip, const char *name,
     void *value, size_t size)
 {
-	char *xattr_name;
-	int error;
-	/* xattr_resolve_name will do this for us if this is defined */
-	xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
-	error = zpl_xattr_get(ip, xattr_name, value, size);
-	kmem_strfree(xattr_name);
+	char xattr_name[XATTR_NAME_MAX + 1];
 
-	return (error);
+	zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
+	    XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN,
+	    name, strlen(name));
+
+	return (zpl_xattr_get(ip, xattr_name, value, size));
 }
 ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get);
 
@@ -906,14 +927,13 @@ __zpl_xattr_security_set(zidmap_t *user_ns,
     const void *value, size_t size, int flags)
 {
 	(void) user_ns;
-	char *xattr_name;
-	int error;
-	/* xattr_resolve_name will do this for us if this is defined */
-	xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
-	error = zpl_xattr_set(ip, xattr_name, value, size, flags);
-	kmem_strfree(xattr_name);
+	char xattr_name[XATTR_NAME_MAX + 1];
 
-	return (error);
+	zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
+	    XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN,
+	    name, strlen(name));
+
+	return (zpl_xattr_set(ip, xattr_name, value, size, flags));
 }
 ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set);
 

From af0228bb54092f61fb5f118aeb0d313538742f88 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Tue, 26 May 2026 01:05:07 +0200
Subject: [PATCH 071/129] ZTS: zpool_expand_005_pos: correct variable name in
 expandsize check

The check referenced $zpool_expandsize, which is not defined in this
test; the variable assigned two lines above is $expandsize. A "-"
value returned by zpool reopen therefore did not trigger the
intended log_fail, and the failure surfaced only at the later
post-online-e size check with a less specific message.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18580
---
 .../functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh
index 530661a686a..92c97aacd84 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh
@@ -82,7 +82,7 @@ log_must zpool reopen $TESTPOOL1
 
 typeset expandsize=$(get_pool_prop expandsize $TESTPOOL1)
 log_note "pool expandsize: $expandsize"
-if [[ "$zpool_expandsize" = "-" ]]; then
+if [[ "$expandsize" = "-" ]]; then
 	log_fail "pool $TESTPOOL1 did not detect any " \
 	    "expandsize after reopen"
 fi

From 88656cc95b698c7089364767bb0b5ba0a7d6e625 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Tue, 26 May 2026 01:07:02 +0200
Subject: [PATCH 072/129] ZTS/alloc_class: move file_in_special_vdev to
 alloc_class.kshlib

Move the function into the shared library.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18584
---
 .../functional/alloc_class/alloc_class.kshlib | 34 ++++++++++++++++++
 .../alloc_class/alloc_class_012_pos.ksh       | 35 -------------------
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib b/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib
index be281c62404..649a6ec601c 100644
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib
@@ -67,3 +67,37 @@ function display_status
 
 	return $ret
 }
+
+#
+# Verify the file identified by the input <inode> is written on a special vdev
+# According to the pool layout used in this test vdev_id 3 and 4 are special
+#
+function file_in_special_vdev # <dataset> <inode>
+{
+	typeset dataset="$1"
+	typeset inum="$2"
+	typeset num_normal=$(echo $ZPOOL_DISKS | wc -w)
+	num_normal=${num_normal##* }
+
+	zdb -dddddd $dataset $inum | awk -v d=$num_normal '{
+# find DVAs from string "offset level dva" only for L0 (data) blocks
+if (match($0,"L0 [0-9]+")) {
+   dvas[0]=$3
+   dvas[1]=$4
+   dvas[2]=$5
+   for (i = 0; i < 3; ++i) {
+      if (match(dvas[i],"([^:]+):.*")) {
+         dva = substr(dvas[i], RSTART, RLENGTH);
+         # parse DVA from string "vdev:offset:asize"
+         if (split(dva,arr,":") != 3) {
+            print "Error parsing DVA: <" dva ">";
+            exit 1;
+         }
+         # verify vdev is "special"
+         if (arr[1] < d) {
+            exit 1;
+         }
+      }
+   }
+}}'
+}
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh
index 743a717b2e8..3d463b37611 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh
@@ -25,41 +25,6 @@
 
 verify_runnable "global"
 
-#
-# Verify the file identified by the input <inode> is written on a special vdev
-# According to the pool layout used in this test vdev_id 3 and 4 are special
-# XXX: move this function to libtest.shlib once we get "Vdev Properties"
-#
-function file_in_special_vdev # <dataset> <inode>
-{
-	typeset dataset="$1"
-	typeset inum="$2"
-	typeset num_normal=$(echo $ZPOOL_DISKS | wc -w)
-	num_normal=${num_normal##* }
-
-	zdb -dddddd $dataset $inum | awk -v d=$num_normal '{
-# find DVAs from string "offset level dva" only for L0 (data) blocks
-if (match($0,"L0 [0-9]+")) {
-   dvas[0]=$3
-   dvas[1]=$4
-   dvas[2]=$5
-   for (i = 0; i < 3; ++i) {
-      if (match(dvas[i],"([^:]+):.*")) {
-         dva = substr(dvas[i], RSTART, RLENGTH);
-         # parse DVA from string "vdev:offset:asize"
-         if (split(dva,arr,":") != 3) {
-            print "Error parsing DVA: <" dva ">";
-            exit 1;
-         }
-         # verify vdev is "special"
-         if (arr[1] < d) {
-            exit 1;
-         }
-      }
-   }
-}}'
-}
-
 #
 # Check that device removal works for special class vdevs
 #

From efdc755761ca4e96057405aac315cd9153db26cf Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Tue, 26 May 2026 21:10:44 +0200
Subject: [PATCH 073/129] ZTS/zinject: cover label, object, delay, panic and
 verify effect (#18579)

* ZTS/zinject: cover label, object, delay, panic and verify effect

Cover the device, label, object, delay and panic injection modes:
every valid value is accepted and unknown values are rejected. A
final pass confirms that registered injections execute by watching
the inject counter advance after triggering the desired injected
error.

Signed-off-by: Christos Longros <chris.longros@gmail.com>

* ZTS/zinject: add author copyright

Signed-off-by: Christos Longros <chris.longros@gmail.com>

---------

Signed-off-by: Christos Longros <chris.longros@gmail.com>
---
 .../cli_root/zinject/zinject_args.ksh         | 140 +++++++++++++++++-
 1 file changed, 138 insertions(+), 2 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
index 93c320da6fd..f08e4fb6472 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
@@ -23,11 +23,15 @@
 
 #
 # Copyright (c) 2024, Klara Inc.
+# Copyright (c) 2026, Christos Longros <chris.longros@gmail.com>
 #
 
 #
-# TODO: this only checks that the set of valid device fault types. It should
-#       check all the other options, and that they work, and everything really.
+# This covers device, label, object, delay, panic injection modes:
+# every valid value is accepted and unknown values are rejected.
+# A final pass also confirms that a registered injection actually
+# executes by watching the inject counter advance after triggering
+# the desired injected error.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -39,6 +43,7 @@ log_assert "Check zinject parameters."
 log_onexit cleanup
 
 DISK1=${DISKS%% *}
+TESTFILE=/$TESTPOOL/zinject_args.file
 
 function cleanup
 {
@@ -56,8 +61,139 @@ function test_device_fault
 	zinject -c all
 }
 
+function test_device_fault_neg
+{
+	log_mustnot eval "zinject -d $DISK1 -e bogus -T read $TESTPOOL"
+	log_mustnot eval "zinject -d $DISK1 -e io -T bogus $TESTPOOL"
+	zinject -c all
+}
+
+function test_label_fault
+{
+	typeset -a labels=("nvlist" "pad1" "pad2" "uber")
+	for l in ${labels[@]}; do
+		log_must eval \
+		    "zinject -d $DISK1 -e io -L $l $TESTPOOL"
+	done
+	zinject -c all
+}
+
+function test_label_fault_neg
+{
+	log_mustnot eval "zinject -d $DISK1 -e io -L bogus $TESTPOOL"
+	zinject -c all
+}
+
+function test_object_fault
+{
+	log_must dd if=/dev/urandom of=$TESTFILE bs=128k count=1
+	log_must zpool sync $TESTPOOL
+
+	for t in data dnode; do
+		log_must eval "zinject -t $t -e io -f 0.001 $TESTFILE"
+	done
+	zinject -c all
+
+	for t in mos mosdir metaslab config bpobj spacemap errlog; do
+		log_must eval "zinject -t $t -e io -f 0.001 $TESTPOOL"
+	done
+	zinject -c all
+}
+
+function test_object_fault_neg
+{
+	log_mustnot eval "zinject -t bogus -e io $TESTPOOL"
+	log_mustnot eval "zinject -t data -e bogus $TESTFILE"
+	# -t data only accepts checksum or io as the error type.
+	log_mustnot eval "zinject -t data -e nxio $TESTFILE"
+	zinject -c all
+}
+
+function test_delay_fault
+{
+	log_must eval "zinject -d $DISK1 -D 10:1 $TESTPOOL"
+	log_must eval "zinject -d $DISK1 -D 25:2 -T read $TESTPOOL"
+	log_must eval "zinject -d $DISK1 -D 25:2 -T write $TESTPOOL"
+	zinject -c all
+}
+
+function test_delay_fault_neg
+{
+	log_mustnot eval "zinject -d $DISK1 -D 0:1 $TESTPOOL"
+	log_mustnot eval "zinject -d $DISK1 -D 10 $TESTPOOL"
+	log_mustnot eval "zinject -d $DISK1 -D foo $TESTPOOL"
+	zinject -c all
+}
+
+function test_panic_fault
+{
+	# An unmatched function tag so zio_handle_panic_injection() never fires.
+	log_must eval "zinject -p zfs_test_no_such_fn $TESTPOOL"
+	log_must eval "zinject -p zfs_test_no_such_fn $TESTPOOL 1"
+	zinject | grep -q zfs_test_no_such_fn || \
+	    log_fail "panic function was not registered"
+	zinject -c all
+}
+
+function test_panic_fault_neg
+{
+	log_mustnot eval "zinject -p f -d $DISK1 $TESTPOOL"
+	log_mustnot eval "zinject -p f -t data $TESTFILE"
+	log_mustnot eval "zinject -p f -f 50 $TESTPOOL"
+	zinject -c all
+}
+
+# Each registered device/delay/data handler row ends with "match inject".
+function inject_count
+{
+	zinject | awk '/^ *[0-9]/{print $NF}' | head -n 1
+}
+
+function verify_injection
+{
+	typeset cnt
+
+	log_must zfs set primarycache=none $TESTPOOL
+	log_must dd if=/dev/urandom of=$TESTFILE bs=128k count=1
+	log_must zpool sync $TESTPOOL
+
+	log_must eval "zinject -d $DISK1 -e io -T read -f 100 $TESTPOOL"
+	dd if=$TESTFILE of=/dev/null bs=128k count=1 >/dev/null 2>&1 || true
+	cnt=$(inject_count)
+	[[ -n $cnt && $cnt -gt 0 ]] || \
+	    log_fail "device-fault injection did not execute (inject=$cnt)"
+	zinject -c all
+
+	log_must eval "zinject -t data -e checksum -f 100 $TESTFILE"
+	dd if=$TESTFILE of=/dev/null bs=128k count=1 >/dev/null 2>&1 || true
+	cnt=$(inject_count)
+	[[ -n $cnt && $cnt -gt 0 ]] || \
+	    log_fail "object-fault injection did not execute (inject=$cnt)"
+	zinject -c all
+
+	log_must eval "zinject -d $DISK1 -D 5:1 -T write $TESTPOOL"
+	log_must dd if=/dev/urandom of=$TESTFILE bs=128k count=1
+	log_must zpool sync $TESTPOOL
+	cnt=$(inject_count)
+	[[ -n $cnt && $cnt -gt 0 ]] || \
+	    log_fail "delay injection did not execute (inject=$cnt)"
+	zinject -c all
+
+	log_must zfs inherit primarycache $TESTPOOL
+}
+
 default_mirror_setup_noexit $DISKS
 
 test_device_fault
+test_device_fault_neg
+test_label_fault
+test_label_fault_neg
+test_object_fault
+test_object_fault_neg
+test_delay_fault
+test_delay_fault_neg
+test_panic_fault
+test_panic_fault_neg
+verify_injection
 
 log_pass "zinject parameters work as expected."

From 2e5b9bd1168c4018e0aefcfc4e60295032ddbde3 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sat, 23 May 2026 21:25:56 +1000
Subject: [PATCH 074/129] unit: zero coverage counters before coverage run

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18586
---
 tests/unit/Makefile.am | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am
index 43b7ddb84ca..7e29779c3c0 100644
--- a/tests/unit/Makefile.am
+++ b/tests/unit/Makefile.am
@@ -46,7 +46,9 @@ PHONY += unit unit-coverage
 _unit_run_%: %D%/%
 	@echo "  UNITTEST $<" ; $<
 
-_unit_coverage_%: _unit_run_%
+_unit_coverage_%: %D%/%
+	@${LCOV} --quiet --zerocounters --directory $(top_srcdir) >/dev/null
+	@echo "  UNITTEST $<" ; $<
 	@${LCOV} --quiet --capture  \
 		--test-name $(subst _unit_coverage_, , $@) \
 		--directory $(top_srcdir) \

From 605ae841022cb64392e8c499c86c837b18828aee Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sat, 23 May 2026 21:33:26 +1000
Subject: [PATCH 075/129] unit: TOPT make arg to pass test options through to
 the test binary

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18586
---
 tests/unit/Makefile.am | 4 ++--
 tests/unit/README.md   | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am
index 7e29779c3c0..e23037a3e1b 100644
--- a/tests/unit/Makefile.am
+++ b/tests/unit/Makefile.am
@@ -44,11 +44,11 @@ nodist_%C%_test_zap_SOURCES = \
 PHONY += unit unit-coverage
 
 _unit_run_%: %D%/%
-	@echo "  UNITTEST $<" ; $<
+	@echo "  UNITTEST $<" ; $< $(TOPT)
 
 _unit_coverage_%: %D%/%
 	@${LCOV} --quiet --zerocounters --directory $(top_srcdir) >/dev/null
-	@echo "  UNITTEST $<" ; $<
+	@echo "  UNITTEST $<" ; $< $(TOPT)
 	@${LCOV} --quiet --capture  \
 		--test-name $(subst _unit_coverage_, , $@) \
 		--directory $(top_srcdir) \
diff --git a/tests/unit/README.md b/tests/unit/README.md
index 464c8f1c731..a7096067529 100644
--- a/tests/unit/README.md
+++ b/tests/unit/README.md
@@ -45,6 +45,10 @@ Running test suite with seed 0x18e131ac...
 ...
 ```
 
+The test framework provides various options for controlling how the tests are
+run. Add the `--help` switch for more info. If using the make rule, options can
+be passed via the `TOPT=` param.
+
 ### Building just for tests
 
 Recommended “minimum” build for just the unit tests, with additional debug to

From 6ecaa194b61cd4a26a2802a0dbb4461f91ebf715 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sun, 10 May 2026 15:03:26 +1000
Subject: [PATCH 076/129] zap: expose _by_dnode() variants of remaining core
 functions

Exposes the remaining internal implementation functions:
- zap_update_by_dnode()
- zap_length_by_dnode()
- zap_get_stats_by_dnode()

And creates zap_contains_by_dnode(), following the same structure as the
other functions.

Together, these complete the "core" ZAP _by_dnode() API for the test
suite to use.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18586
---
 include/sys/zap.h |  6 ++++++
 module/zfs/zap.c  | 22 +++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index 69f021034ba..7e89ad7d3de 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -259,6 +259,7 @@ int zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
  * exist, 0 if it does. This is like zap_lookup(), but may be more efficient.
  */
 int zap_contains(objset_t *os, uint64_t zapobj, const char *name);
+int zap_contains_by_dnode(dnode_t *dn, const char *name);
 
 /*
  * Prefetch the blocks within the ZAP where the given key is stored. The
@@ -309,6 +310,8 @@ int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
  */
 int zap_update(objset_t *os, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_by_dnode(dnode_t *dn, const char *name, int integer_size,
+    uint64_t num_integers, const void *val, dmu_tx_t *tx);
 
 /* Update by uint64_t[] key. */
 int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
@@ -327,6 +330,8 @@ int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
  */
 int zap_length(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers);
+int zap_length_by_dnode(dnode_t *dn, const char *name,
+    uint64_t *integer_size, uint64_t *num_integers);
 
 /* Attribute length by uint64_t[] key. */
 int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
@@ -585,6 +590,7 @@ typedef struct zap_stats {
  * know what you're doing.
  */
 int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs);
+int zap_get_stats_by_dnode(dnode_t *dn, zap_stats_t *zs);
 
 /* ZAP subsystem setup/teardown */
 void zap_init(void);
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 3de985c37cb..caed9c67794 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -354,15 +354,27 @@ zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
 /* zap_contains */
 
 int
-zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+zap_contains_by_dnode(dnode_t *dn, const char *name)
 {
-	int err = zap_lookup_norm(os, zapobj, name, 0,
+	int err = zap_lookup_norm_by_dnode(dn, name, 0,
 	    0, NULL, 0, NULL, 0, NULL);
 	if (err == EOVERFLOW || err == EINVAL)
 		err = 0; /* found, but skipped reading the value */
 	return (err);
 }
 
+int
+zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = zap_contains_by_dnode(dn, name);
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
 /* zap_prefetch */
 
 static int
@@ -550,7 +562,7 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 
 /* zap_update */
 
-static int
+int
 zap_update_by_dnode(dnode_t *dn, const char *name, int integer_size,
     uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
@@ -647,7 +659,7 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 
 /* zap_length */
 
-static int
+int
 zap_length_by_dnode(dnode_t *dn, const char *name, uint64_t *integer_size,
     uint64_t *num_integers)
 {
@@ -1208,7 +1220,7 @@ zap_cursor_serialize(zap_cursor_t *zc)
 
 /* zap_get_stats */
 
-static int
+int
 zap_get_stats_by_dnode(dnode_t *dn, zap_stats_t *zs)
 {
 	zap_t *zap;

From 1294d442039307314cc131f9f3d24f70af111d57 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Tue, 5 May 2026 19:31:06 +1000
Subject: [PATCH 077/129] test_zap: cover all core ZAP operations

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18586
---
 tests/unit/test_zap.c | 299 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 298 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_zap.c b/tests/unit/test_zap.c
index d8ec4288ec1..5d53e49b2c7 100644
--- a/tests/unit/test_zap.c
+++ b/tests/unit/test_zap.c
@@ -231,6 +231,290 @@ test_zap_basic(const MunitParameter params[], void *data)
 
 /* ========== */
 
+/*
+ * "Core" ZAP API tests. Covers the most basic functionality upon which
+ * everything else is built.
+ *
+ * Note that to avoid microzap upgrade here, we only use short keys and
+ * single-uint64 values.
+ */
+
+/* zap_add: add new items. */
+static MunitResult
+test_zap_add(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* A key added can be found by that name. */
+	uint64_t va = 1, var = 0;
+	unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &va, tx));
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var));
+	unit_eq(var, 1);
+
+	/* Another key added can be found by that name. */
+	uint64_t vb = 2, vbr = 0;
+	unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &vb, tx));
+	unit_ok(zap_lookup_by_dnode(dn, "b", sizeof (uint64_t), 1, &vbr));
+	unit_eq(vbr, 2);
+
+	/* The first key is still findable with the right value. */
+	var = 0;
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var));
+	unit_eq(var, 1);
+
+	/* Adding the key again fails. */
+	unit_err(zap_add_by_dnode(dn, "a",
+	    sizeof (uint64_t), 1, &va, tx), EEXIST);
+
+	/* Adding the key with a different value still fails. */
+	va = 2;
+	unit_err(zap_add_by_dnode(dn, "a",
+	    sizeof (uint64_t), 1, &va, tx), EEXIST);
+
+	/* And is still findable with the original value. */
+	var = 0;
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var));
+	unit_eq(var, 1);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* zap_update: add new or replace existing items. */
+static MunitResult
+test_zap_update(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* Update on a non-existent key inserts it. */
+	uint64_t va = 1, var = 0;
+	unit_ok(zap_update_by_dnode(dn, "a", sizeof (uint64_t), 1, &va, tx));
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var));
+	unit_eq(var, 1);
+
+	/* Update on an existing key replaces it without error. */
+	va = 2;
+	unit_ok(zap_update_by_dnode(dn, "a", sizeof (uint64_t), 1, &va, tx));
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var));
+	unit_eq(var, 2);
+
+	/* Count should still be 1 (no duplicate was created). */
+	uint64_t count = 0;
+	unit_ok(zap_count_by_dnode(dn, &count));
+	unit_eq(count, 1);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* zap_remove: remove existing items. */
+static MunitResult
+test_zap_remove(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* Removing a non-existing key fails. */
+	unit_err(zap_remove_by_dnode(dn, "a", tx), ENOENT);
+
+	/* Adding two keys. */
+	uint64_t va = 1, vb = 2;
+	unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &va, tx));
+	unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &vb, tx));
+
+	/* Remove an existing key succeeds. */
+	unit_ok(zap_remove_by_dnode(dn, "a", tx));
+
+	/* After removing, looking up removed key fails. */
+	uint64_t var = 0;
+	unit_err(
+	    zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var), ENOENT);
+
+	/* Looking up the other key succeeds, and has the correct value. */
+	uint64_t vbr = 0;
+	unit_ok(zap_lookup_by_dnode(dn, "b", sizeof (uint64_t), 1, &vbr));
+	unit_eq(vbr, 2);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* zap_count: number of entries, typically without lookup or traversal. */
+static MunitResult
+test_zap_count(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* A new ZAP has zero entries. */
+	uint64_t count = 0;
+	unit_ok(zap_count_by_dnode(dn, &count));
+	unit_eq(count, 0);
+
+	/* Adding two keys bumps the count to 2. */
+	uint64_t v = 1;
+	unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx));
+	unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &v, tx));
+	unit_ok(zap_count_by_dnode(dn, &count));
+	unit_eq(count, 2);
+
+	/* Removing a key reduces the count. */
+	unit_ok(zap_remove_by_dnode(dn, "a", tx));
+	unit_ok(zap_count_by_dnode(dn, &count));
+	unit_eq(count, 1);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* zap_contains: existence check without reading the value. */
+static MunitResult
+test_zap_contains(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	uint64_t v = 1;
+	unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx));
+	unit_ok(zap_contains_by_dnode(dn, "a"));
+	unit_err(zap_contains_by_dnode(dn, "b"), ENOENT);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* zap_length: item metadata without reading the value. */
+static MunitResult
+test_zap_length(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* uint64: integer_size=8, num_integers=1. */
+	uint64_t v = 42;
+	unit_ok(zap_add_by_dnode(dn, "u64",
+	    sizeof (uint64_t), 1, &v, tx));
+
+	uint64_t isz = 0, nint = 0;
+	unit_ok(zap_length_by_dnode(dn, "u64", &isz, &nint));
+	unit_eq(isz, 8);
+	unit_eq(nint, 1);
+
+	/* Missing key returns ENOENT. */
+	unit_err(zap_length_by_dnode(dn, "nope", &isz, &nint), ENOENT);
+
+	/* Either output pointer may be NULL. */
+	isz = 0; nint = 0;
+	unit_ok(zap_length_by_dnode(dn, "u64", NULL, &nint));
+	unit_ok(zap_length_by_dnode(dn, "u64", &isz, NULL));
+	unit_eq(isz, 8);
+	unit_eq(nint, 1);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* ========== */
+
+/*
+ * Separate stats tests for each ZAP type, since they are about internals and
+ * so can and will produce different results.
+ */
+
+static MunitResult
+test_microzap_stats(const MunitParameter params[], void *data)
+{
+	(void) params; (void) data;
+
+	dnode_t *dn = mock_zap_create_microzap();
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	zap_stats_t zs;
+	uint64_t v = 1;
+	unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx));
+	unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &v, tx));
+	unit_ok(zap_get_stats_by_dnode(dn, &zs));
+
+	/* We added two entries. */
+	unit_eq(zs.zs_num_entries, 2);
+
+	/* MicroZAP is always a single block. */
+	unit_eq(zs.zs_num_blocks, 1);
+
+	/* Blocksize matches what we passed to mock_dnode_create(). */
+	unit_eq(zs.zs_blocksize, 512);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_microzap(dn));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+static MunitResult
+test_fatzap_stats(const MunitParameter params[], void *data)
+{
+	(void) params; (void) data;
+
+	dnode_t *dn = mock_zap_create_fatzap();
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	zap_stats_t zs;
+	uint64_t v = 1;
+	unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx));
+	unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &v, tx));
+	unit_ok(zap_get_stats_by_dnode(dn, &zs));
+
+	/* We added two entries. */
+	unit_eq(zs.zs_num_entries, 2);
+
+	/* One header block, one leaf block. */
+	unit_eq(zs.zs_num_blocks, 2);
+
+	/* FatZAP block size set by tuneable. */
+	unit_eq(zs.zs_blocksize, 1 << fzap_default_block_shift);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_fatzap(dn));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* ========== */
+
 /* Test suite definition and boilerplate. */
 
 #define	UNIT_PARAM_ZAP_TYPES(p)	\
@@ -241,11 +525,24 @@ static const MunitParameterEnum zap_type_params[] = {
 	{ 0 },
 };
 
+#define	UNIT_TEST_ZAP_TYPES(name, func)	\
+	UNIT_TEST(name, func, zap_type_params)
+
 static const MunitTest zap_tests[] = {
 	UNIT_TEST("mock_microzap_sanity",	test_mock_microzap_sanity),
 	UNIT_TEST("mock_fatzap_sanity",		test_mock_fatzap_sanity),
 
-	UNIT_TEST("zap_basic",	test_zap_basic,	zap_type_params),
+	UNIT_TEST_ZAP_TYPES("zap_basic",	test_zap_basic),
+
+	UNIT_TEST_ZAP_TYPES("zap_add",		test_zap_add),
+	UNIT_TEST_ZAP_TYPES("zap_update",	test_zap_update),
+	UNIT_TEST_ZAP_TYPES("zap_remove",	test_zap_remove),
+	UNIT_TEST_ZAP_TYPES("zap_count",	test_zap_count),
+	UNIT_TEST_ZAP_TYPES("zap_contains",	test_zap_contains),
+	UNIT_TEST_ZAP_TYPES("zap_length",	test_zap_length),
+
+	UNIT_TEST("microzap_stats",		test_microzap_stats),
+	UNIT_TEST("fatzap_stats",		test_fatzap_stats),
 
 	{ 0 },
 };

From 6c08f5db51626e0b57a010b4f887a1db8ac56e2e Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 27 May 2026 05:49:53 +1000
Subject: [PATCH 078/129] config: detect the right way to get pthreads

To get at userspace threads, we use a mix of -pthread and -lpthread to
compiler and/or linker. That's fine enough for the platforms we target
but it's not exactly right (eg on Linux -pthread defines _REENTRANT,
when -lpthread does not), and won't work properly on some other
platforms that we might end up on someday (eg illumos).

There's also a danger if we link together two compilation units, one
compiled with -pthread, one not, as calls between them may not properly
manage thread state.

Here we switch to use the AX_PTHREAD macro to detect the correct set of
flags for CFLAGS and LIBS, and add them to the default compilation
flags for all units.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18588
---
 cmd/Makefile.am                 |   1 -
 cmd/zed/Makefile.am             |   1 -
 config/Rules.am                 |   2 +
 config/ax_pthread.m4            | 523 ++++++++++++++++++++++++++++++++
 configure.ac                    |   1 +
 etc/Makefile.am                 |   2 -
 lib/libspl/Makefile.am          |   4 -
 lib/libzfs/Makefile.am          |   4 +-
 lib/libzfs_core/Makefile.am     |   4 +-
 lib/libzpool/Makefile.am        |   4 +-
 tests/unit/Makefile.am          |   2 -
 tests/zfs-tests/cmd/Makefile.am |   5 -
 12 files changed, 529 insertions(+), 24 deletions(-)
 create mode 100644 config/ax_pthread.m4

diff --git a/cmd/Makefile.am b/cmd/Makefile.am
index 6f8d0c4b1db..6e54be7466a 100644
--- a/cmd/Makefile.am
+++ b/cmd/Makefile.am
@@ -54,7 +54,6 @@ ztest_LDADD = \
 	libnvpair.la
 
 ztest_LDADD += -lm
-ztest_LDFLAGS = -pthread
 
 
 include $(srcdir)/%D%/raidz_test/Makefile.am
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am
index 0166d072356..712917401a0 100644
--- a/cmd/zed/Makefile.am
+++ b/cmd/zed/Makefile.am
@@ -41,6 +41,5 @@ zed_LDADD = \
 	libnvpair.la
 
 zed_LDADD += -lrt $(LIBATOMIC_LIBS) $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
-zed_LDFLAGS = -pthread
 
 dist_noinst_DATA += %D%/agents/README.md
diff --git a/config/Rules.am b/config/Rules.am
index 5117929cac5..c4a9641f58f 100644
--- a/config/Rules.am
+++ b/config/Rules.am
@@ -23,6 +23,7 @@ AM_CFLAGS += $(IMPLICIT_FALLTHROUGH)
 AM_CFLAGS += $(DEBUG_CFLAGS)
 AM_CFLAGS += $(ASAN_CFLAGS)
 AM_CFLAGS += $(UBSAN_CFLAGS)
+AM_CFLAGS += $(PTHREAD_CFLAGS)
 AM_CFLAGS += $(CODE_COVERAGE_CFLAGS)
 AM_CFLAGS += $(NO_FORMAT_ZERO_LENGTH)
 AM_CFLAGS += $(NO_FORMAT_TRUNCATION)
@@ -57,6 +58,7 @@ endif
 AM_LDFLAGS  = $(DEBUG_LDFLAGS)
 AM_LDFLAGS += $(ASAN_LDFLAGS)
 AM_LDFLAGS += $(UBSAN_LDFLAGS)
+AM_LDFLAGS += $(PTHREAD_LIBS)
 
 if BUILD_FREEBSD
 AM_LDFLAGS += -fstack-protector-strong
diff --git a/config/ax_pthread.m4 b/config/ax_pthread.m4
new file mode 100644
index 00000000000..daea8c5987e
--- /dev/null
+++ b/config/ax_pthread.m4
@@ -0,0 +1,523 @@
+# SPDX-License-Identifier: GPL-3.0-or-later WITH Autoconf-exception-macro
+# ===========================================================================
+#        https://www.gnu.org/software/autoconf-archive/ax_pthread.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+#
+# DESCRIPTION
+#
+#   This macro figures out how to build C programs using POSIX threads. It
+#   sets the PTHREAD_LIBS output variable to the threads library and linker
+#   flags, and the PTHREAD_CFLAGS output variable to any special C compiler
+#   flags that are needed. (The user can also force certain compiler
+#   flags/libs to be tested by setting these environment variables.)
+#
+#   Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is
+#   needed for multi-threaded programs (defaults to the value of CC
+#   respectively CXX otherwise). (This is necessary on e.g. AIX to use the
+#   special cc_r/CC_r compiler alias.)
+#
+#   NOTE: You are assumed to not only compile your program with these flags,
+#   but also to link with them as well. For example, you might link with
+#   $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
+#   $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
+#
+#   If you are only building threaded programs, you may wish to use these
+#   variables in your default LIBS, CFLAGS, and CC:
+#
+#     LIBS="$PTHREAD_LIBS $LIBS"
+#     CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+#     CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS"
+#     CC="$PTHREAD_CC"
+#     CXX="$PTHREAD_CXX"
+#
+#   In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant
+#   has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to
+#   that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+#
+#   Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the
+#   PTHREAD_PRIO_INHERIT symbol is defined when compiling with
+#   PTHREAD_CFLAGS.
+#
+#   ACTION-IF-FOUND is a list of shell commands to run if a threads library
+#   is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it
+#   is not found. If ACTION-IF-FOUND is not specified, the default action
+#   will define HAVE_PTHREAD.
+#
+#   Please let the authors know if this macro fails on any platform, or if
+#   you have any other suggestions or comments. This macro was based on work
+#   by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help
+#   from M. Frigo), as well as ac_pthread and hb_pthread macros posted by
+#   Alejandro Forero Cuervo to the autoconf macro repository. We are also
+#   grateful for the helpful feedback of numerous users.
+#
+#   Updated for Autoconf 2.68 by Daniel Richard G.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
+#   Copyright (c) 2011 Daniel Richard G. <skunk@iSKUNK.ORG>
+#   Copyright (c) 2019 Marc Stevens <marc.stevens@cwi.nl>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 31
+
+AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
+AC_DEFUN([AX_PTHREAD], [
+AC_REQUIRE([AC_CANONICAL_HOST])
+AC_REQUIRE([AC_PROG_CC])
+AC_REQUIRE([AC_PROG_SED])
+AC_LANG_PUSH([C])
+ax_pthread_ok=no
+
+# We used to check for pthread.h first, but this fails if pthread.h
+# requires special compiler flags (e.g. on Tru64 or Sequent).
+# It gets checked for in the link test anyway.
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then
+        ax_pthread_save_CC="$CC"
+        ax_pthread_save_CFLAGS="$CFLAGS"
+        ax_pthread_save_LIBS="$LIBS"
+        AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"])
+        AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"])
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS])
+        AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes])
+        AC_MSG_RESULT([$ax_pthread_ok])
+        if test "x$ax_pthread_ok" = "xno"; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        CC="$ax_pthread_save_CC"
+        CFLAGS="$ax_pthread_save_CFLAGS"
+        LIBS="$ax_pthread_save_LIBS"
+fi
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# Create a list of thread flags to try. Items with a "," contain both
+# C compiler flags (before ",") and linker flags (after ","). Other items
+# starting with a "-" are C compiler flags, and remaining items are
+# library names, except for "none" which indicates that we try without
+# any flags at all, and "pthread-config" which is a program returning
+# the flags for the Pth emulation library.
+
+ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64
+#           (Note: HP C rejects this with "bad form for `-t' option")
+# -pthreads: Solaris/gcc (Note: HP C also rejects)
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads and
+#      -D_REENTRANT too), HP C (must be checked before -lpthread, which
+#      is present but should not be used directly; and before -mthreads,
+#      because the compiler interprets this as "-mt" + "-hreads")
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+# pthread-config: use pthread-config program (for GNU Pth library)
+
+case $host_os in
+
+        freebsd*)
+
+        # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+        # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+
+        ax_pthread_flags="-kthread lthread $ax_pthread_flags"
+        ;;
+
+        hpux*)
+
+        # From the cc(1) man page: "[-mt] Sets various -D flags to enable
+        # multi-threading and also sets -lpthread."
+
+        ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags"
+        ;;
+
+        openedition*)
+
+        # IBM z/OS requires a feature-test macro to be defined in order to
+        # enable POSIX threads at all, so give the user a hint if this is
+        # not set. (We don't define these ourselves, as they can affect
+        # other portions of the system API in unpredictable ways.)
+
+        AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING],
+            [
+#            if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS)
+             AX_PTHREAD_ZOS_MISSING
+#            endif
+            ],
+            [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])])
+        ;;
+
+        solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed. (N.B.: The stubs are missing
+        # pthread_cleanup_push, or rather a function called by this macro,
+        # so we could check for that, but who knows whether they'll stub
+        # that too in a future libc.)  So we'll check first for the
+        # standard Solaris way of linking pthreads (-mt -lpthread).
+
+        ax_pthread_flags="-mt,-lpthread pthread $ax_pthread_flags"
+        ;;
+esac
+
+# Are we compiling with Clang?
+
+AC_CACHE_CHECK([whether $CC is Clang],
+    [ax_cv_PTHREAD_CLANG],
+    [ax_cv_PTHREAD_CLANG=no
+     # Note that Autoconf sets GCC=yes for Clang as well as GCC
+     if test "x$GCC" = "xyes"; then
+        AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG],
+            [/* Note: Clang 2.7 lacks __clang_[a-z]+__ */
+#            if defined(__clang__) && defined(__llvm__)
+             AX_PTHREAD_CC_IS_CLANG
+#            endif
+            ],
+            [ax_cv_PTHREAD_CLANG=yes])
+     fi
+    ])
+ax_pthread_clang="$ax_cv_PTHREAD_CLANG"
+
+
+# GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC)
+
+# Note that for GCC and Clang -pthread generally implies -lpthread,
+# except when -nostdlib is passed.
+# This is problematic using libtool to build C++ shared libraries with pthread:
+# [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25460
+# [2] https://bugzilla.redhat.com/show_bug.cgi?id=661333
+# [3] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=468555
+# To solve this, first try -pthread together with -lpthread for GCC
+
+AS_IF([test "x$GCC" = "xyes"],
+      [ax_pthread_flags="-pthread,-lpthread -pthread -pthreads $ax_pthread_flags"])
+
+# Clang takes -pthread (never supported any other flag), but we'll try with -lpthread first
+
+AS_IF([test "x$ax_pthread_clang" = "xyes"],
+      [ax_pthread_flags="-pthread,-lpthread -pthread"])
+
+
+# The presence of a feature test macro requesting re-entrant function
+# definitions is, on some systems, a strong hint that pthreads support is
+# correctly enabled
+
+case $host_os in
+        darwin* | hpux* | linux* | osf* | solaris*)
+        ax_pthread_check_macro="_REENTRANT"
+        ;;
+
+        aix*)
+        ax_pthread_check_macro="_THREAD_SAFE"
+        ;;
+
+        *)
+        ax_pthread_check_macro="--"
+        ;;
+esac
+AS_IF([test "x$ax_pthread_check_macro" = "x--"],
+      [ax_pthread_check_cond=0],
+      [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"])
+
+
+if test "x$ax_pthread_ok" = "xno"; then
+for ax_pthread_try_flag in $ax_pthread_flags; do
+
+        case $ax_pthread_try_flag in
+                none)
+                AC_MSG_CHECKING([whether pthreads work without any flags])
+                ;;
+
+                *,*)
+                PTHREAD_CFLAGS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\1/"`
+                PTHREAD_LIBS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\2/"`
+                AC_MSG_CHECKING([whether pthreads work with "$PTHREAD_CFLAGS" and "$PTHREAD_LIBS"])
+                ;;
+
+                -*)
+                AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag])
+                PTHREAD_CFLAGS="$ax_pthread_try_flag"
+                ;;
+
+                pthread-config)
+                AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no])
+                AS_IF([test "x$ax_pthread_config" = "xno"], [continue])
+                PTHREAD_CFLAGS="`pthread-config --cflags`"
+                PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+                ;;
+
+                *)
+                AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag])
+                PTHREAD_LIBS="-l$ax_pthread_try_flag"
+                ;;
+        esac
+
+        ax_pthread_save_CFLAGS="$CFLAGS"
+        ax_pthread_save_LIBS="$LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+
+        AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>
+#                       if $ax_pthread_check_cond
+#                        error "$ax_pthread_check_macro must be defined"
+#                       endif
+                        static void *some_global = NULL;
+                        static void routine(void *a)
+                          {
+                             /* To avoid any unused-parameter or
+                                unused-but-set-parameter warning.  */
+                             some_global = a;
+                          }
+                        static void *start_routine(void *a) { return a; }],
+                       [pthread_t th; pthread_attr_t attr;
+                        pthread_create(&th, 0, start_routine, 0);
+                        pthread_join(th, 0);
+                        pthread_attr_init(&attr);
+                        pthread_cleanup_push(routine, 0);
+                        pthread_cleanup_pop(0) /* ; */])],
+            [ax_pthread_ok=yes],
+            [])
+
+        CFLAGS="$ax_pthread_save_CFLAGS"
+        LIBS="$ax_pthread_save_LIBS"
+
+        AC_MSG_RESULT([$ax_pthread_ok])
+        AS_IF([test "x$ax_pthread_ok" = "xyes"], [break])
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+
+# Clang needs special handling, because older versions handle the -pthread
+# option in a rather... idiosyncratic way
+
+if test "x$ax_pthread_clang" = "xyes"; then
+
+        # Clang takes -pthread; it has never supported any other flag
+
+        # (Note 1: This will need to be revisited if a system that Clang
+        # supports has POSIX threads in a separate library.  This tends not
+        # to be the way of modern systems, but it's conceivable.)
+
+        # (Note 2: On some systems, notably Darwin, -pthread is not needed
+        # to get POSIX threads support; the API is always present and
+        # active.  We could reasonably leave PTHREAD_CFLAGS empty.  But
+        # -pthread does define _REENTRANT, and while the Darwin headers
+        # ignore this macro, third-party headers might not.)
+
+        # However, older versions of Clang make a point of warning the user
+        # that, in an invocation where only linking and no compilation is
+        # taking place, the -pthread option has no effect ("argument unused
+        # during compilation").  They expect -pthread to be passed in only
+        # when source code is being compiled.
+        #
+        # Problem is, this is at odds with the way Automake and most other
+        # C build frameworks function, which is that the same flags used in
+        # compilation (CFLAGS) are also used in linking.  Many systems
+        # supported by AX_PTHREAD require exactly this for POSIX threads
+        # support, and in fact it is often not straightforward to specify a
+        # flag that is used only in the compilation phase and not in
+        # linking.  Such a scenario is extremely rare in practice.
+        #
+        # Even though use of the -pthread flag in linking would only print
+        # a warning, this can be a nuisance for well-run software projects
+        # that build with -Werror.  So if the active version of Clang has
+        # this misfeature, we search for an option to squash it.
+
+        AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread],
+            [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG],
+            [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown
+             # Create an alternate version of $ac_link that compiles and
+             # links in two steps (.c -> .o, .o -> exe) instead of one
+             # (.c -> exe), because the warning occurs only in the second
+             # step
+             ax_pthread_save_ac_link="$ac_link"
+             ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g'
+             ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"`
+             ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)"
+             ax_pthread_save_CFLAGS="$CFLAGS"
+             for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do
+                AS_IF([test "x$ax_pthread_try" = "xunknown"], [break])
+                CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS"
+                ac_link="$ax_pthread_save_ac_link"
+                AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])],
+                    [ac_link="$ax_pthread_2step_ac_link"
+                     AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])],
+                         [break])
+                    ])
+             done
+             ac_link="$ax_pthread_save_ac_link"
+             CFLAGS="$ax_pthread_save_CFLAGS"
+             AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no])
+             ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try"
+            ])
+
+        case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in
+                no | unknown) ;;
+                *) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;;
+        esac
+
+fi # $ax_pthread_clang = yes
+
+
+
+# Various other checks:
+if test "x$ax_pthread_ok" = "xyes"; then
+        ax_pthread_save_CFLAGS="$CFLAGS"
+        ax_pthread_save_LIBS="$LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+
+        # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
+        AC_CACHE_CHECK([for joinable pthread attribute],
+            [ax_cv_PTHREAD_JOINABLE_ATTR],
+            [ax_cv_PTHREAD_JOINABLE_ATTR=unknown
+             for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+                 AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>],
+                                                 [int attr = $ax_pthread_attr; return attr /* ; */])],
+                                [ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break],
+                                [])
+             done
+            ])
+        AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \
+               test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \
+               test "x$ax_pthread_joinable_attr_defined" != "xyes"],
+              [AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE],
+                                  [$ax_cv_PTHREAD_JOINABLE_ATTR],
+                                  [Define to necessary symbol if this constant
+                                   uses a non-standard name on your system.])
+               ax_pthread_joinable_attr_defined=yes
+              ])
+
+        AC_CACHE_CHECK([whether more special flags are required for pthreads],
+            [ax_cv_PTHREAD_SPECIAL_FLAGS],
+            [ax_cv_PTHREAD_SPECIAL_FLAGS=no
+             case $host_os in
+             solaris*)
+             ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS"
+             ;;
+             esac
+            ])
+        AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \
+               test "x$ax_pthread_special_flags_added" != "xyes"],
+              [PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS"
+               ax_pthread_special_flags_added=yes])
+
+        AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT],
+            [ax_cv_PTHREAD_PRIO_INHERIT],
+            [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]],
+                                             [[int i = PTHREAD_PRIO_INHERIT;
+                                               return i;]])],
+                            [ax_cv_PTHREAD_PRIO_INHERIT=yes],
+                            [ax_cv_PTHREAD_PRIO_INHERIT=no])
+            ])
+        AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \
+               test "x$ax_pthread_prio_inherit_defined" != "xyes"],
+              [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])
+               ax_pthread_prio_inherit_defined=yes
+              ])
+
+        CFLAGS="$ax_pthread_save_CFLAGS"
+        LIBS="$ax_pthread_save_LIBS"
+
+        # More AIX lossage: compile with *_r variant
+        if test "x$GCC" != "xyes"; then
+            case $host_os in
+                aix*)
+                AS_CASE(["x/$CC"],
+                    [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6],
+                    [#handle absolute path differently from PATH based program lookup
+                     AS_CASE(["x$CC"],
+                         [x/*],
+                         [
+			   AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])
+			   AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])])
+			 ],
+                         [
+			   AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])
+			   AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])])
+			 ]
+                     )
+                    ])
+                ;;
+            esac
+        fi
+fi
+
+test -n "$PTHREAD_CC" || PTHREAD_CC="$CC"
+test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX"
+
+AC_SUBST([PTHREAD_LIBS])
+AC_SUBST([PTHREAD_CFLAGS])
+AC_SUBST([PTHREAD_CC])
+AC_SUBST([PTHREAD_CXX])
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test "x$ax_pthread_ok" = "xyes"; then
+        ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1])
+        :
+else
+        ax_pthread_ok=no
+        $2
+fi
+AC_LANG_POP
+])dnl AX_PTHREAD
diff --git a/configure.ac b/configure.ac
index 3757b5e2cac..74e4ab3bdf8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,6 +54,7 @@ AC_PROG_LN_S
 PKG_PROG_PKG_CONFIG
 AM_PROG_AS
 AM_PROG_CC_C_O
+AX_PTHREAD
 AX_CODE_COVERAGE
 _AM_PROG_TAR(pax)
 
diff --git a/etc/Makefile.am b/etc/Makefile.am
index 58b3cf563b6..2bea12ae514 100644
--- a/etc/Makefile.am
+++ b/etc/Makefile.am
@@ -88,8 +88,6 @@ systemdgenerator_PROGRAMS = \
 %C%_systemd_system_generators_zfs_mount_generator_LDADD = \
 	libzfs.la
 
-%C%_systemd_system_generators_zfs_mount_generator_LDFLAGS = -pthread
-
 CPPCHECKTARGETS += $(systemdgenerator_PROGRAMS)
 endif
 
diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am
index 8b50c65c0e6..4b097297816 100644
--- a/lib/libspl/Makefile.am
+++ b/lib/libspl/Makefile.am
@@ -63,7 +63,3 @@ libspl_la_LIBADD = \
 libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME)
 
 libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS)
-
-if BUILD_FREEBSD
-libspl_assert_la_LIBADD += -lpthread
-endif
diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am
index 450c501556e..deae3534749 100644
--- a/lib/libzfs/Makefile.am
+++ b/lib/libzfs/Makefile.am
@@ -76,7 +76,7 @@ libzfs_la_LIBADD = \
 
 libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL)
 
-libzfs_la_LDFLAGS = -pthread
+libzfs_la_LDFLAGS = -version-info 7:0:0
 
 if !ASAN_ENABLED
 libzfs_la_LDFLAGS += -Wl,-z,defs
@@ -86,8 +86,6 @@ if BUILD_FREEBSD
 libzfs_la_LIBADD += -lutil -lgeom
 endif
 
-libzfs_la_LDFLAGS += -version-info 7:0:0
-
 pkgconfig_DATA += %D%/libzfs.pc
 
 dist_noinst_DATA += %D%/libzfs.abi %D%/libzfs.suppr
diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am
index ec7aa95aa02..751deeeb228 100644
--- a/lib/libzfs_core/Makefile.am
+++ b/lib/libzfs_core/Makefile.am
@@ -33,7 +33,7 @@ libzfs_core_la_LIBADD = \
 
 libzfs_core_la_LIBADD += $(LTLIBINTL)
 
-libzfs_core_la_LDFLAGS = -pthread
+libzfs_core_la_LDFLAGS = -version-info 3:0:0
 
 if !ASAN_ENABLED
 libzfs_core_la_LDFLAGS += -Wl,-z,defs
@@ -43,8 +43,6 @@ if BUILD_FREEBSD
 libzfs_core_la_LIBADD += -lutil -lgeom
 endif
 
-libzfs_core_la_LDFLAGS += -version-info 3:0:0
-
 pkgconfig_DATA += %D%/libzfs_core.pc
 
 dist_noinst_DATA += %D%/libzfs_core.abi %D%/libzfs_core.suppr
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 05105407d52..22c7ceaa1ba 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -214,7 +214,7 @@ libzpool_la_LIBADD = \
 
 libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -lm
 
-libzpool_la_LDFLAGS = -pthread
+libzpool_la_LDFLAGS = -version-info 7:0:0
 
 if !ASAN_ENABLED
 libzpool_la_LDFLAGS += -Wl,-z,defs
@@ -224,8 +224,6 @@ if BUILD_FREEBSD
 libzpool_la_LIBADD += -lgeom
 endif
 
-libzpool_la_LDFLAGS += -version-info 7:0:0
-
 if TARGET_CPU_POWERPC
 module/zfs/libzpool_la-vdev_raidz_math_powerpc_altivec.$(OBJEXT) : CFLAGS += -maltivec
 module/zfs/libzpool_la-vdev_raidz_math_powerpc_altivec.l$(OBJEXT): CFLAGS += -maltivec
diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am
index e23037a3e1b..cb5bfc10013 100644
--- a/tests/unit/Makefile.am
+++ b/tests/unit/Makefile.am
@@ -37,8 +37,6 @@ nodist_%C%_test_zap_SOURCES = \
 	libbtree.la \
 	libunit.la
 
-%C%_test_zap_LDFLAGS = -pthread
-
 
 # test run and coverage targets below
 PHONY += unit unit-coverage
diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am
index 9f92310985e..3275c1358aa 100644
--- a/tests/zfs-tests/cmd/Makefile.am
+++ b/tests/zfs-tests/cmd/Makefile.am
@@ -35,7 +35,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/crypto_test
 %C%_crypto_test_LDADD = libzpool.la
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/clone_after_trunc
-%C%_clone_after_trunc_LDADD = -lpthread
 
 if WANT_DEVNAME2DEVID
 scripts_zfs_tests_bin_PROGRAMS += %D%/devname2devid
@@ -71,7 +70,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/libzfs_mnttab_cache_check
 	libzfs.la
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/manipulate_user_buffer
-%C%_manipulate_user_buffer_LDADD = -lpthread
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree
 %C%_mkfile_LDADD = $(LTLIBINTL)
@@ -80,7 +78,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree
 scripts_zfs_tests_bin_PROGRAMS += \
 	%D%/mmap_exec %D%/mmap_ftruncate %D%/mmap_seek \
 	%D%/mmap_sync %D%/mmapwrite %D%/readmmap %D%/mmap_write_sync
-%C%_mmapwrite_LDADD = -lpthread
 
 if WANT_MMAP_LIBAIO
 scripts_zfs_tests_bin_PROGRAMS += %D%/mmap_libaio
@@ -95,7 +92,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/nvlist_to_lua
 	libnvpair.la
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/rm_lnkcnt_zero_file
-%C%_rm_lnkcnt_zero_file_LDADD = -lpthread
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/send_doall
 %C%_send_doall_LDADD = \
@@ -107,7 +103,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/stride_dd
 %C%_stride_dd_LDADD = -lrt
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/threadsappend
-%C%_threadsappend_LDADD = -lpthread
 
 scripts_zfs_tests_bin_PROGRAMS += %D%/ereports
 %C%_ereports_LDADD = \

From 8bfac28f1592998cb21b56113c16b4313a7c4e00 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Wed, 27 May 2026 00:45:38 +0200
Subject: [PATCH 079/129] .github: update workflows README

Describe the current zfs-qemu pipeline, ci_type selection, supported
guests, and the code-checking and other auxiliary workflows.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18590
---
 .github/workflows/README.md | 129 +++++++++++++++++++++++-------------
 1 file changed, 82 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index eef47dae3dc..78774aac52f 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -1,61 +1,96 @@
 
-## The testings are done this way
+## CI overview
+
+The main test pipeline is `zfs-qemu.yml`. Code checking and other
+workflows run independently alongside it.
 
 ```mermaid
 flowchart TB
-subgraph CleanUp and Summary
-  CleanUp+Summary
+subgraph Functional testing
+  Setup[test-config: pick ci_type + OS matrix]
+  Setup --> almalinux
+  Setup --> centos[centos-stream]
+  Setup --> debian
+  Setup --> fedora
+  Setup --> ubuntu
+  Setup --> freebsd
+  almalinux --> Cleanup[cleanup + summary]
+  centos --> Cleanup
+  debian --> Cleanup
+  fedora --> Cleanup
+  ubuntu --> Cleanup
+  freebsd --> Cleanup
 end
 
-subgraph Functional Testings
-  sanity-checks-20.04
-  zloop-checks-20.04
-  functional-testing-20.04-->Part1-20.04
-  functional-testing-20.04-->Part2-20.04
-  functional-testing-20.04-->Part3-20.04
-  functional-testing-20.04-->Part4-20.04
-  functional-testing-22.04-->Part1-22.04
-  functional-testing-22.04-->Part2-22.04
-  functional-testing-22.04-->Part3-22.04
-  functional-testing-22.04-->Part4-22.04
-  sanity-checks-22.04
-  zloop-checks-22.04
-end
-
-subgraph Code Checking + Building
-  Build-Ubuntu-20.04
+subgraph Code checking
+  checkstyle.yaml
   codeql.yml
-  checkstyle.yml
-  Build-Ubuntu-22.04
+  smatch.yml
 end
 
-  Build-Ubuntu-20.04-->sanity-checks-20.04
-  Build-Ubuntu-20.04-->zloop-checks-20.04
-  Build-Ubuntu-20.04-->functional-testing-20.04
-  Build-Ubuntu-22.04-->sanity-checks-22.04
-  Build-Ubuntu-22.04-->zloop-checks-22.04
-  Build-Ubuntu-22.04-->functional-testing-22.04
-
-  sanity-checks-20.04-->CleanUp+Summary
-  Part1-20.04-->CleanUp+Summary
-  Part2-20.04-->CleanUp+Summary
-  Part3-20.04-->CleanUp+Summary
-  Part4-20.04-->CleanUp+Summary
-  Part1-22.04-->CleanUp+Summary
-  Part2-22.04-->CleanUp+Summary
-  Part3-22.04-->CleanUp+Summary
-  Part4-22.04-->CleanUp+Summary
-  sanity-checks-22.04-->CleanUp+Summary
+subgraph Other workflows
+  zfs-arm.yml
+  zloop.yml
+  labels.yml
+end
 ```
 
+Every `qemu-vm` matrix entry runs on a fixed `ubuntu-24.04` host.
+The steps inside one entry are:
 
-1) build zfs modules for Ubuntu 20.04 and 22.04 (~15m)
-2) 2x zloop test (~10m) + 2x sanity test (~25m)
-3) 4x functional testings in parts 1..4 (each ~1h)
-4) cleanup and create summary
-   - content of summary depends on the results of the steps
+1) set up QEMU and boot the guest (~2-4m)
+2) install build dependencies in the guest (~2-4m)
+3) build zfs modules in the guest (~8-12m)
+4) run functional tests (~2-4h)
+5) package and upload per-OS test logs (~10s)
 
-When everything runs fine, the full run should be done in
-about 2 hours.
+A per-OS entry takes about 3 to 4 hours. Once all entries finish, the
+`cleanup` job aggregates the results into a summary.
 
-The codeql.yml and checkstyle.yml are not part in this circle.
+### `ci_type` selection
+
+`test-config` runs `.github/workflows/scripts/generate-ci-type.py` against
+the PR's changed files and picks one of:
+
+| `ci_type` | OS matrix                                  |
+|-----------|--------------------------------------------|
+| `docs`    | empty (documentation-only PRs)             |
+| `quick`   | 6 Linux + 1 FreeBSD                        |
+| `linux`   | all supported Linux distros                |
+| `freebsd` | all supported FreeBSD versions             |
+| default   | cross-platform sample                      |
+
+Pushes to `openzfs/zfs` skip the matrix entirely; only PRs (and pushes to
+forks) build.
+
+Authors can force a specific ci_type by adding `ZFS-CI-Type: <type>` to
+the most recent commit message. The `ZTS_OS_OVERRIDE` repository variable
+can also alter the selection. The `workflow_dispatch` trigger accepts
+`fedora_kernel_ver` (Fedora-only run with a chosen kernel) and
+`specific_os` (pin the matrix to one OS).
+
+### Supported guests
+
+Auto-selected:
+
+- Linux: almalinux 8/9/10, centos-stream 9/10, debian 11/12/13,
+  fedora 43/44, ubuntu 22/24/26
+- FreeBSD: 14.4-RELEASE/STABLE, 15.0-RELEASE, 15.1-STABLE, 16.0-CURRENT
+
+Available via `specific_os` or `ZTS_OS_OVERRIDE`:
+
+- archlinux, tumbleweed
+
+### Code checking
+
+- `checkstyle.yaml`: source-style checks
+- `codeql.yml`: CodeQL analysis
+- `smatch.yml`: smatch analysis
+
+### Other workflows
+
+- `zfs-arm.yml`: ARM build on `ubuntu-24.04-arm`
+- `zloop.yml`: host-side zloop
+- `labels.yml`: maintains PR status labels
+- `zfs-qemu-packages.yml`: manually dispatched, builds release RPMs or
+  tests RPM installation from the ZFS yum repo

From 6303a582421d485aab052bd9df4204c5b0bcab0f Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Wed, 27 May 2026 02:15:42 +0200
Subject: [PATCH 080/129] spa: expose max_missing_tvds_cachefile and _scan on
 Linux

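Register the two siblings of zfs_max_missing_tvds via
ZFS_MODULE_PARAM in spa.c, replacing the FreeBSD-only sysctl
definitions in sysctl_os.c, and document both tunables in zfs(4).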

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18589
---
 man/man4/zfs.4                    | 13 +++++++++++++
 module/os/freebsd/zfs/sysctl_os.c | 12 ------------
 module/zfs/spa.c                  |  6 ++++++
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 11b6c622f8e..657070de02a 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -918,6 +918,19 @@ but that was not proven to be useful.
 Number of missing top-level vdevs which will be allowed during
 pool import (only in read-only mode).
 .
+.It Sy zfs_max_missing_tvds_cachefile Ns = Ns Sy 2 Pq u64
+Number of missing top-level vdevs tolerated when importing a pool
+from a cachefile, before the trusted config is read from the MOS.
+A cachefile can fall out of sync with the on-disk config after a
+device removal that did not rewrite the cachefile, so the default
+of 2 still lets the import reach a copy of the MOS.
+.
+.It Sy zfs_max_missing_tvds_scan Ns = Ns Sy 0 Pq u64
+Number of missing top-level vdevs tolerated when importing a pool
+by scanning device paths, before the trusted config is read from
+the MOS.
+Defaults to 0 because a scan should detect every present device.
+.
 .It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64
 Maximum size in bytes allowed to be passed as
 .Sy zc_nvlist_src_size
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index 934055da88d..eb7ada12e14 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -381,18 +381,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval,
 	"Configuration cache file write, retry after failure, interval"
 	" (seconds)");
 
-extern uint64_t zfs_max_missing_tvds_cachefile;
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile,
-	CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0,
-	"Allow importing pools with missing top-level vdevs in cache file");
-
-extern uint64_t zfs_max_missing_tvds_scan;
-
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan,
-	CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0,
-	"Allow importing pools with missing top-level vdevs during scan");
-
 /* spa_misc.c */
 
 extern int zfs_flags;
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index eafd4b17620..7c466bf2d22 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -11811,6 +11811,12 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
 	"Allow importing pool with up to this number of missing top-level "
 	"vdevs (in read-only mode)");
 
+ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds_cachefile, U64, ZMOD_RW,
+	"Allow importing pools with missing top-level vdevs in cache file");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds_scan, U64, ZMOD_RW,
+	"Allow importing pools with missing top-level vdevs during scan");
+
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
 	ZMOD_RW, "Set the livelist condense zthr to pause");
 

From dc585960e0c70142b189f215f8da3aca5218def1 Mon Sep 17 00:00:00 2001
From: tiehexue <tiehexue@hotmail.com>
Date: Thu, 28 May 2026 01:07:55 +0800
Subject: [PATCH 081/129] Linux 5.6 compat: fix fs_parse API mismatch

Add an m4 macro to check the fs_parse() API signature, and a wrapper.  Before
5.6, fs_parse() took a struct fs_parameter_description which wraps
the parameter specs with name and enum pointers. From 5.6, the
description struct was removed and fs_parse() accepts the
fs_parameter_spec directly.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: tiehexue <tiehexue@hotmail.com>
Closes #18585
---
 config/kernel-fs-parse.m4       | 34 +++++++++++++++++++++++++++++++++
 config/kernel.m4                |  2 ++
 module/os/linux/zfs/zpl_super.c | 25 ++++++++++++++++++++++--
 3 files changed, 59 insertions(+), 2 deletions(-)
 create mode 100644 config/kernel-fs-parse.m4

diff --git a/config/kernel-fs-parse.m4 b/config/kernel-fs-parse.m4
new file mode 100644
index 00000000000..7a6ffa77238
--- /dev/null
+++ b/config/kernel-fs-parse.m4
@@ -0,0 +1,34 @@
+dnl # SPDX-License-Identifier: CDDL-1.0
+dnl #
+dnl # 5.6 API change
+dnl # Before 5.6, fs_parse() took a struct fs_parameter_description
+dnl # which wraps the parameter specs with name and enum pointers. From 5.6,
+dnl # the description struct was removed and fs_parse() accepts the
+dnl # fs_parameter_spec directly.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_FS_PARSE], [
+	ZFS_LINUX_TEST_SRC([fs_parse], [
+		#include <linux/fs_context.h>
+		#include <linux/fs_parser.h>
+	],[
+		static const struct fs_parameter_spec specs[] = {
+			{}
+		};
+		int test __attribute__ ((unused));
+		struct fs_context *fc __attribute__ ((unused)) = NULL;
+		struct fs_parameter param __attribute__ ((unused));
+		struct fs_parse_result result __attribute__ ((unused));
+		test = fs_parse(fc, specs, &param, &result);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_FS_PARSE], [
+	AC_MSG_CHECKING([whether fs_parse() takes fs_parameter_spec directly])
+	ZFS_LINUX_TEST_RESULT([fs_parse], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_FS_PARSE_TAKES_SPEC, 1,
+		    [fs_parse() takes fs_parameter_spec directly])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 7225591b86d..55f40767567 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -78,6 +78,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE
 	ZFS_AC_KERNEL_SRC_SECURITY_INODE
 	ZFS_AC_KERNEL_SRC_FS_CONTEXT
+	ZFS_AC_KERNEL_SRC_FS_PARSE
 	ZFS_AC_KERNEL_SRC_SB_DYING
 	ZFS_AC_KERNEL_SRC_SET_NLINK
 	ZFS_AC_KERNEL_SRC_SGET
@@ -201,6 +202,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_TRUNCATE_SETSIZE
 	ZFS_AC_KERNEL_SECURITY_INODE
 	ZFS_AC_KERNEL_FS_CONTEXT
+	ZFS_AC_KERNEL_FS_PARSE
 	ZFS_AC_KERNEL_SB_DYING
 	ZFS_AC_KERNEL_SET_NLINK
 	ZFS_AC_KERNEL_SGET
diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
index e1fa0f8e88e..d7194e4f1f7 100644
--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@@ -613,13 +613,34 @@ static const struct fs_parameter_spec zpl_param_spec[] = {
 	{}
 };
 
+/*
+ * Before 5.6, fs_parse() took a struct fs_parameter_description
+ * which wraps the parameter specs with name and enum pointers. From 5.6,
+ * the description struct was removed and fs_parse() accepts the
+ * fs_parameter_spec directly.
+ */
+static int
+zpl_fs_parse(struct fs_context *fc, struct fs_parameter *param,
+	struct fs_parse_result *result)
+{
+#ifdef HAVE_FS_PARSE_TAKES_SPEC
+	return (fs_parse(fc, zpl_param_spec, param, result));
+#else
+	static const struct fs_parameter_description zpl_param_desc = {
+		.name = "zfs",
+		.specs = zpl_param_spec,
+	};
+	return (fs_parse(fc, &zpl_param_desc, param, result));
+#endif
+}
+
 static int
 zpl_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	vfs_t *vfs = fc->fs_private;
 
 	struct fs_parse_result result;
-	int opt = fs_parse(fc, zpl_param_spec, param, &result);
+	int opt = zpl_fs_parse(fc, param, &result);
 	if (opt == -ENOPARAM) {
 		/*
 		 * Convert unknowns to warnings, to work around the whole
@@ -803,7 +824,7 @@ zpl_parse_monolithic(struct fs_context *fc, void *data)
 
 		/* Check if this is one of our options. */
 		struct fs_parse_result result;
-		int opt = fs_parse(fc, zpl_param_spec, &param, &result);
+		int opt = zpl_fs_parse(fc, &param, &result);
 		if (opt >= 0) {
 			/*
 			 * We already know this one of our options, so a

From e30ab5fa4f99091482adf9c9904945c439c1e0aa Mon Sep 17 00:00:00 2001
From: Mark Johnston <markjdb@gmail.com>
Date: Thu, 28 May 2026 12:02:48 -0400
Subject: [PATCH 082/129] FreeBSD: Make it possible to build openzfs.ko with
 sanitizers

Add make options which let one respectively compile the kernel modules
with the address sanitizer, memory sanitizer, and undefined behaviour
sanitizer enabled.  This makes it much easier to run the ZTS with those
sanitizers enabled.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Chris Longros <chris.longros@gmail.com>
Signed-off-by: Mark Johnston <markj@FreeBSD.org>
Closes #18596
---
 module/Makefile.bsd | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 96c3f3b2418..a0ddbeb9ae6 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -65,6 +65,12 @@ CFLAGS+= -DZFS_DEBUG -g
 CFLAGS += -DNDEBUG
 .endif
 
+.for _SAN in KASAN KMSAN KUBSAN
+.if defined(WITH_${_SAN}) && ${WITH_${_SAN}} == "true"
+KERN_OPTS_EXTRA+= ${_SAN}
+.endif
+.endfor
+
 .if defined(WITH_GCOV) && ${WITH_GCOV} == "true"
 CFLAGS+=	 -fprofile-arcs -ftest-coverage
 .endif

From 3250b4393ec15e2bb3abcdd09fcee792b1a2dc7b Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Fri, 29 May 2026 00:06:46 +0200
Subject: [PATCH 083/129] CI: Update checkstyle checkout action to v6

The checkstyle workflow was the only one still pinned to
actions/checkout@v4; the other workflows already use v6.
Bump it to match.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18600
---
 .github/workflows/checkstyle.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/checkstyle.yaml b/.github/workflows/checkstyle.yaml
index ddcc2b8581f..ce1e1fb8a46 100644
--- a/.github/workflows/checkstyle.yaml
+++ b/.github/workflows/checkstyle.yaml
@@ -12,7 +12,7 @@ jobs:
   checkstyle:
     runs-on: ubuntu-22.04
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
       with:
         ref: ${{ github.event.pull_request.head.sha }}
     - name: Install dependencies

From 472ddca116935e924dc10baa3dc75ef22d15ce65 Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Thu, 28 May 2026 18:14:26 -0400
Subject: [PATCH 084/129] zed: Prefer spares with matching rotational and size

Before this change zed tried to activate spares in the order they
are stored in the configuration, which is quite arbitrary.  To get
a better result, sort the spares by their rotational status and
size, so that the best-fitting ones are tried first.

To make it more visible, export the rotational status as a vdev
property.  While at it, minimally fix vdev properties reading for
spare and L2ARC vdevs, having no ZAPs.

To keep the rotational status available for spare activation when
the failed device is already gone, save it into the vdev config.
The same is done for the asize of spare vdevs.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18597
---
 cmd/zed/agents/zfs_retire.c                   | 82 ++++++++++++++++-
 include/sys/fs/zfs.h                          |  2 +
 lib/libzfs/libzfs.abi                         |  3 +-
 man/man7/vdevprops.7                          |  2 +
 module/zcommon/zpool_prop.c                   |  3 +
 module/zfs/spa.c                              | 16 +++-
 module/zfs/vdev.c                             | 48 ++++++++--
 module/zfs/vdev_label.c                       |  8 ++
 tests/runfiles/common.run                     |  2 +-
 tests/runfiles/linux.run                      |  3 +-
 tests/zfs-tests/tests/Makefile.am             |  2 +
 .../cli_root/zpool_get/vdev_get.cfg           |  1 +
 .../fault/auto_spare_rotational.ksh           | 84 +++++++++++++++++
 .../vdev_zaps/vdev_zaps_008_pos.ksh           | 90 +++++++++++++++++++
 14 files changed, 328 insertions(+), 18 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/fault/auto_spare_rotational.ksh
 create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_008_pos.ksh

diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index 8aabf6d3bf7..0c6c30f2e86 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -350,6 +350,47 @@ is_draid_fdomain_failure(fmd_hdl_t *hdl, libzfs_handle_t *zhdl,
 	return (res);
 }
 
+/*
+ * Returns B_TRUE if spare 'a' should be tried before spare 'b' when
+ * replacing a failed vdev with the given characteristics.
+ *
+ * Ordering criteria (most to least significant):
+ *  1. Matching rotational is preferred over mismatching.
+ *  2. Large enough is preferred over (potentially?) too small.
+ *  3. Smaller size is preferred over bigger (best fit).
+ */
+static boolean_t
+spare_is_preferred(nvlist_t *a, nvlist_t *b, boolean_t have_rotational,
+    uint64_t vdev_rotational, uint64_t vdev_size)
+{
+	uint64_t a_rotational = 0, b_rotational = 0;
+	uint64_t a_size = 0, b_size = 0;
+
+	if (have_rotational) {
+		(void) nvlist_lookup_uint64(a, ZPOOL_CONFIG_VDEV_ROTATIONAL,
+		    &a_rotational);
+		(void) nvlist_lookup_uint64(b, ZPOOL_CONFIG_VDEV_ROTATIONAL,
+		    &b_rotational);
+		if ((a_rotational == vdev_rotational) !=
+		    (b_rotational == vdev_rotational))
+			return (a_rotational == vdev_rotational);
+	}
+
+	vdev_stat_t *vs;
+	unsigned int c;
+	if (nvlist_lookup_uint64_array(a, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0)
+		a_size = vs->vs_rsize;
+	if (nvlist_lookup_uint64_array(b, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0)
+		b_size = vs->vs_rsize;
+	boolean_t a_ok = (a_size >= vdev_size);
+	boolean_t b_ok = (b_size >= vdev_size);
+	if (a_ok != b_ok)
+		return (a_ok);
+	return (a_size < b_size);
+}
+
 /*
  * Given a vdev, attempt to replace it with every known spare until one
  * succeeds or we run out of devices to try.
@@ -364,6 +405,10 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 	char *dev_name;
 	zprop_source_t source;
 	int ashift;
+	uint64_t vdev_rotational = 0, vdev_size = 0;
+	boolean_t have_vdev_rotational;
+	vdev_stat_t *vs;
+	unsigned int c;
 
 	config = zpool_get_config(zhp, NULL);
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
@@ -377,6 +422,34 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 	    &spares, &nspares) != 0)
 		return (B_FALSE);
 
+	/*
+	 * Collect the failed vdev's parameters for optimal replacement.
+	 */
+	have_vdev_rotational = (nvlist_lookup_uint64(vdev,
+	    ZPOOL_CONFIG_VDEV_ROTATIONAL, &vdev_rotational) == 0);
+	if (nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0)
+		vdev_size = vs->vs_rsize;
+
+	/*
+	 * Build a sorted index array over the spares, so that better
+	 * candidates are tried first.
+	 */
+	uint_t order[nspares];
+	for (s = 0; s < nspares; s++)
+		order[s] = s;
+	for (s = 1; s < nspares; s++) {
+		uint_t key = order[s];
+		int j = (int)s - 1;
+		while (j >= 0 && spare_is_preferred(spares[key],
+		    spares[order[j]], have_vdev_rotational, vdev_rotational,
+		    vdev_size)) {
+			order[j + 1] = order[j];
+			j--;
+		}
+		order[j + 1] = key;
+	}
+
 	/*
 	 * lookup "ashift" pool property, we may need it for the replacement
 	 */
@@ -394,25 +467,26 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 	 * replace it.
 	 */
 	for (s = 0; s < nspares; s++) {
+		nvlist_t *spare = spares[order[s]];
 		boolean_t rebuild = B_FALSE;
 		const char *spare_name, *type;
 
-		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
+		if (nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH,
 		    &spare_name) != 0)
 			continue;
 
 		/* prefer sequential resilvering for distributed spares */
-		if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
+		if ((nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE,
 		    &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
 			rebuild = B_TRUE;
 
 		/* if set, add the "ashift" pool property to the spare nvlist */
 		if (source != ZPROP_SRC_DEFAULT)
-			(void) nvlist_add_uint64(spares[s],
+			(void) nvlist_add_uint64(spare,
 			    ZPOOL_CONFIG_ASHIFT, ashift);
 
 		(void) nvlist_add_nvlist_array(replacement,
-		    ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spares[s], 1);
+		    ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spare, 1);
 
 		fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
 		    dev_name, zfs_basename(spare_name));
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 4c4d15f8ce0..8e877166ada 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -478,6 +478,7 @@ typedef enum {
 	VDEV_PROP_FDOMAIN,
 	VDEV_PROP_FGROUP,
 	VDEV_PROP_ALLOC_BIAS,
+	VDEV_PROP_ROTATIONAL,
 	VDEV_NUM_PROPS
 } vdev_prop_t;
 
@@ -931,6 +932,7 @@ typedef struct zpool_load_policy {
 #define	ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH	"vdev_enc_sysfs_path"
 
 #define	ZPOOL_CONFIG_WHOLE_DISK		"whole_disk"
+#define	ZPOOL_CONFIG_VDEV_ROTATIONAL	"rotational"
 #define	ZPOOL_CONFIG_ERRCOUNT		"error_count"
 #define	ZPOOL_CONFIG_NOT_PRESENT	"not_present"
 #define	ZPOOL_CONFIG_SPARES		"spares"
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index be74babbcba..3f88f2fb83d 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -6416,7 +6416,8 @@
       <enumerator name='VDEV_PROP_FDOMAIN' value='56'/>
       <enumerator name='VDEV_PROP_FGROUP' value='57'/>
       <enumerator name='VDEV_PROP_ALLOC_BIAS' value='58'/>
-      <enumerator name='VDEV_NUM_PROPS' value='59'/>
+      <enumerator name='VDEV_PROP_ROTATIONAL' value='59'/>
+      <enumerator name='VDEV_NUM_PROPS' value='60'/>
     </enum-decl>
     <typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
     <class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index da38acafeee..b52c6d4b023 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -142,6 +142,8 @@ See
 .Xr zpool-attach 8 .
 .It Sy trim_support
 Indicates if a leaf device supports trim operations.
+.It Sy rotational
+Indicates whether the device backing this vdev uses rotating media.
 .El
 .Pp
 The following native properties can be used to change the behavior of a vdev.
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index ccd9f3854f5..09f5c88d8fb 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -574,6 +574,9 @@ vdev_prop_init(void)
 	    VDEV_BIAS_NONE, PROP_DEFAULT, ZFS_TYPE_VDEV,
 	    "none | log | special | dedup", "ALLOC_BIAS",
 	    vdev_alloc_bias_table, sfeatures);
+	zprop_register_index(VDEV_PROP_ROTATIONAL, "rotational", 0,
+	    PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "ROTATIONAL",
+	    boolean_table, sfeatures);
 
 	/* hidden properties */
 	zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING,
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 7c466bf2d22..ec93ce97433 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -8333,12 +8333,20 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
 		return (spa_vdev_exit(spa, newrootvd, txg, error));
 
 	/*
-	 * log, dedup and special vdevs should not be replaced by spares.
+	 * Spares can't replace log vdevs.
 	 */
-	if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
-	    oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
+	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
+		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+	/*
+	 * For special and dedup vdevs a spare must have matching rotational
+	 * characteristics.  A rotating spare replacing a non-rotating vdev
+	 * would silently degrade pool performance, so we reject the mismatch.
+	 */
+	if (newvd->vdev_isspare &&
+	    oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE &&
+	    newvd->vdev_nonrot != oldvd->vdev_nonrot)
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-	}
 
 	/*
 	 * A dRAID spare can only replace a child of its parent dRAID vdev.
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index e4dc9e97af7..91cd9c6dc84 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -474,8 +474,11 @@ vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
 	uint64_t objid;
 	int err;
 
-	if (vdev_prop_get_objid(vd, &objid) != 0)
-		return (EINVAL);
+	if (vdev_prop_get_objid(vd, &objid) != 0) {
+		/* No ZAP: property was never set, return the default. */
+		*value = vdev_prop_default_numeric(prop);
+		return (ENOENT);
+	}
 
 	err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
 	    sizeof (uint64_t), 1, value);
@@ -963,6 +966,20 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;
 
+	/*
+	 * Restore the last-known rotational status for leaf vdevs.  vdev_open()
+	 * will overwrite this with the hardware value when the device is
+	 * accessible; the persisted value acts as a fallback for failed or
+	 * missing devices so that spare selection can still match on device
+	 * type even when the original disk is gone.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf) {
+		uint64_t rotational = 0;
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL,
+		    &rotational) == 0)
+			vd->vdev_nonrot = !rotational;
+	}
+
 	vic = &vd->vdev_indirect_config;
 
 	ASSERT0(vic->vic_mapping_object);
@@ -6446,9 +6463,15 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 
 	nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
 
-	if (vdev_prop_get_objid(vd, &objid) != 0)
-		return (SET_ERROR(EINVAL));
-	ASSERT(objid != 0);
+	/*
+	 * A missing ZAP is normal for spare and L2ARC vdevs, which are
+	 * not part of the main vdev tree and never get ZAPs allocated.
+	 * Many properties are sourced directly from vdev_t fields and
+	 * work fine without one; ZAP-backed properties will return their
+	 * default values.  objid is set to 0 when absent and the few
+	 * cases that call zap_lookup directly guard against this below.
+	 */
+	(void) vdev_prop_get_objid(vd, &objid);
 
 	mutex_enter(&spa->spa_props_lock);
 
@@ -6772,8 +6795,13 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 			case VDEV_PROP_FAILFAST:
 				src = ZPROP_SRC_LOCAL;
 
-				err = zap_lookup(mos, objid, nvpair_name(elem),
-				    sizeof (uint64_t), 1, &intval);
+				if (objid != 0) {
+					err = zap_lookup(mos, objid,
+					    nvpair_name(elem),
+					    sizeof (uint64_t), 1, &intval);
+				} else {
+					err = ENOENT;
+				}
 				if (err == ENOENT) {
 					if (vd->vdev_ops == &vdev_root_ops)
 						intval =
@@ -6835,6 +6863,10 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 					    ZPROP_SRC_NONE);
 				}
 				continue;
+			case VDEV_PROP_ROTATIONAL:
+				vdev_prop_add_list(outnvl, propname, NULL,
+				    !vd->vdev_nonrot, ZPROP_SRC_NONE);
+				continue;
 			case VDEV_PROP_CHECKSUM_N:
 			case VDEV_PROP_CHECKSUM_T:
 			case VDEV_PROP_IO_N:
@@ -6860,6 +6892,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 				/* FALLTHRU */
 			case VDEV_PROP_USERPROP:
 				/* User Properites */
+				if (objid == 0)
+					continue;
 				src = ZPROP_SRC_LOCAL;
 
 				err = zap_length(mos, objid, nvpair_name(elem),
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index b1371b0349c..b3042980aad 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -493,6 +493,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 		    vd->vdev_wholedisk);
 	}
 
+	if (vd->vdev_ops->vdev_op_leaf) {
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL,
+		    !vd->vdev_nonrot);
+	}
+
 	if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
 
@@ -502,6 +507,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 	if (flags & VDEV_CONFIG_L2CACHE)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 
+	if ((flags & VDEV_CONFIG_SPARE) && vd->vdev_asize != 0)
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize);
+
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
 	    vd == vd->vdev_top) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 6e62b552a0d..0dda8fdfa36 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -1111,7 +1111,7 @@ tags = ['functional', 'vdev_disk']
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos',
     'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos',
-    'vdev_zaps_007_pos']
+    'vdev_zaps_007_pos', 'vdev_zaps_008_pos']
 tags = ['functional', 'vdev_zaps']
 
 [tests/functional/write_dirs]
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 11bda60a9ca..009d984f2b9 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -118,7 +118,8 @@ tags = ['functional', 'fallocate']
 tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
     'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
     'auto_spare_002_pos', 'auto_spare_double', 'auto_spare_multiple',
-    'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault',
+    'auto_spare_ashift', 'auto_spare_rotational', 'auto_spare_shared',
+    'decrypt_fault',
     'decompress_fault', 'fault_limits', 'scrub_after_resilver',
     'suspend_on_probe_errors', 'suspend_resume_single', 'suspend_draid_fgroups',
     'zpool_status_-s']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 98f39253882..c7931ca95e2 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1620,6 +1620,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/fault/auto_spare_001_pos.ksh \
 	functional/fault/auto_spare_002_pos.ksh \
 	functional/fault/auto_spare_ashift.ksh \
+	functional/fault/auto_spare_rotational.ksh \
 	functional/fault/auto_spare_double.ksh \
 	functional/fault/auto_spare_multiple.ksh \
 	functional/fault/auto_spare_shared.ksh \
@@ -2292,6 +2293,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/vdev_zaps/vdev_zaps_005_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_006_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_007_pos.ksh \
+	functional/vdev_zaps/vdev_zaps_008_pos.ksh \
 	functional/write_dirs/cleanup.ksh \
 	functional/write_dirs/setup.ksh \
 	functional/write_dirs/write_dirs_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
index 79992227169..be17821ba1a 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
@@ -66,6 +66,7 @@ typeset -a properties=(
     trim_bytes
     removing
     allocating
+    rotational
     failfast
     checksum_n
     checksum_t
diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_rotational.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_rotational.ksh
new file mode 100755
index 00000000000..5378979a8bb
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fault/auto_spare_rotational.ksh
@@ -0,0 +1,84 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2026, TrueNAS.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/fault/fault.cfg
+
+#
+# DESCRIPTION:
+# ZED prefers the smallest sufficient spare when replacing a faulted
+# special vdev, regardless of spare list order.
+#
+# The 'rotational' property is persisted in the pool config for all leaf
+# vdevs so that spare selection can match device type even after the
+# original disk is gone.  ZED sorts spares preferring matching rotational
+# and, among equally-matching spares, the smallest sufficient one.
+#
+# STRATEGY:
+# 1. Create a pool with a normal mirror, a special mirror, and two file
+#    spares of different sizes.  List the larger spare first so that the
+#    sorted order contradicts the list order.
+# 2. Fault a member of the special mirror; verify ZED activates the
+#    smaller sufficient spare, leaving the larger spare available.
+#
+
+verify_runnable "both"
+
+NORM1="$TEST_BASE_DIR/rotational-norm1"
+NORM2="$TEST_BASE_DIR/rotational-norm2"
+SPEC1="$TEST_BASE_DIR/rotational-spec1"
+SPEC2="$TEST_BASE_DIR/rotational-spec2"
+SPARE_SMALL="$TEST_BASE_DIR/rotational-spare-small"
+SPARE_LARGE="$TEST_BASE_DIR/rotational-spare-large"
+
+LARGE_SIZE=$((MINVDEVSIZE * 2))
+
+function cleanup
+{
+	log_must zinject -c all
+	destroy_pool $TESTPOOL
+	rm -f $NORM1 $NORM2 $SPEC1 $SPEC2 $SPARE_SMALL $SPARE_LARGE
+}
+
+log_assert "ZED selects smallest sufficient spare for a faulted special vdev"
+log_onexit cleanup
+
+zed_events_drain
+
+log_must truncate -s $MINVDEVSIZE $NORM1 $NORM2 $SPEC1 $SPEC2 $SPARE_SMALL
+log_must truncate -s $LARGE_SIZE $SPARE_LARGE
+
+# SPARE_LARGE is listed first so that size-preference sorting is what
+# causes SPARE_SMALL to be selected, not merely list order.
+log_must zpool create -f $TESTPOOL \
+    mirror $NORM1 $NORM2 \
+    special mirror $SPEC1 $SPEC2 \
+    spare $SPARE_LARGE $SPARE_SMALL
+
+log_must zinject -d $SPEC1 -e io -T all -f 100 $TESTPOOL
+log_must zpool scrub $TESTPOOL
+
+log_note "Wait for ZED to auto-spare the special vdev"
+log_must wait_vdev_state $TESTPOOL $SPEC1 "FAULTED" 60
+log_must wait_hotspare_state $TESTPOOL $SPARE_SMALL "INUSE"
+
+# The larger spare must not have been activated.
+log_must wait_hotspare_state $TESTPOOL $SPARE_LARGE "AVAIL"
+
+log_must check_state $TESTPOOL "" "DEGRADED"
+
+log_pass "ZED activated the smallest sufficient spare for the special vdev"
diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_008_pos.ksh b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_008_pos.ksh
new file mode 100755
index 00000000000..c5ad282eb8a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_008_pos.ksh
@@ -0,0 +1,90 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2026, TrueNAS.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify that the 'rotational' vdev property is readable on spare and
+# L2ARC vdevs, which have no per-vdev ZAP, and that its value persists
+# across export/import when the spare device is absent.
+#
+# STRATEGY:
+# 1. Create a pool with a mirror, a spare, and an L2ARC device.
+# 2. Verify 'rotational' is readable on leaf, virtual (mirror), spare,
+#    and L2ARC vdevs.
+# 3. Export the pool, remove the spare file, re-import, and verify that
+#    'rotational' still reports the same value for the missing spare,
+#    proving the value comes from the persisted config.
+#
+
+verify_runnable "global"
+
+SPARE="$TEST_BASE_DIR/vz008-spare"
+L2C="$TEST_BASE_DIR/vz008-l2c"
+VDEV1="$TEST_BASE_DIR/vz008-vdev1"
+VDEV2="$TEST_BASE_DIR/vz008-vdev2"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+	rm -f $VDEV1 $VDEV2 $SPARE $L2C
+}
+
+log_assert "'rotational' is readable on ZAP-less vdevs and persists absent"
+log_onexit cleanup
+
+log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 $SPARE $L2C
+
+log_must zpool create -f $TESTPOOL \
+    mirror $VDEV1 $VDEV2 \
+    cache $L2C \
+    spare $SPARE
+
+# Leaf vdev should report rotational.
+NR=$(zpool get -H -o value rotational $TESTPOOL $VDEV1)
+[[ "$NR" == "on" || "$NR" == "off" ]] ||
+    log_fail "leaf $VDEV1: expected on/off, got '$NR'"
+
+# Virtual (mirror) vdev should report rotational.
+MIRROR=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}')
+NR=$(zpool get -H -o value rotational $TESTPOOL "$MIRROR")
+[[ "$NR" == "on" || "$NR" == "off" ]] ||
+    log_fail "mirror: expected on/off, got '$NR'"
+
+# Spare vdev should report rotational even though it has no ZAP.
+NR=$(zpool get -H -o value rotational $TESTPOOL $SPARE)
+[[ "$NR" == "on" || "$NR" == "off" ]] ||
+    log_fail "spare $SPARE: expected on/off, got '$NR'"
+
+# L2ARC vdev should report rotational even though it has no ZAP.
+NR=$(zpool get -H -o value rotational $TESTPOOL $L2C)
+[[ "$NR" == "on" || "$NR" == "off" ]] ||
+    log_fail "L2ARC $L2C: expected on/off, got '$NR'"
+
+# The value must persist across export/import when the spare is absent.
+# Remove the spare file before re-import so that vdev_open() cannot read
+# the hardware value and the only source is the persisted config.
+NR_BEFORE=$(zpool get -H -o value rotational $TESTPOOL $SPARE)
+log_must zpool export $TESTPOOL
+log_must rm -f $SPARE
+log_must zpool import -d $TEST_BASE_DIR $TESTPOOL
+NR_AFTER=$(zpool get -H -o value rotational $TESTPOOL $SPARE)
+[[ "$NR_BEFORE" == "$NR_AFTER" ]] ||
+    log_fail "spare rotational changed across import: $NR_BEFORE -> $NR_AFTER"
+
+log_pass "'rotational' readable on spare/L2ARC vdevs and persists when absent"

From d13663b17cabdaa64f6dff761830fa81aacdb545 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 28 May 2026 15:45:43 -0700
Subject: [PATCH 085/129] CI: Lustre 6.16 kernel compatibility fix (#18602)

AlmaLinux 9 and 10 kernels now include a backport of Linux commit
v6.15-13744-g41cb08555c41 which renames the from_timer() function
to timer_container_of().  Apply the upstream Lustre compatibility
patch to our builds.  This patch should be included in the next
Lustre release and can be dropped then.

ZFS-CI-Type: quick

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
---
 .github/workflows/scripts/qemu-6-lustre-tests-vm.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh b/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh
index ff3f0a356bb..62917f4cb72 100755
--- a/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh
+++ b/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh
@@ -25,8 +25,14 @@ cd lustre-release
 
 # Include Lustre patches to build against master/zfs-2.4.x.  Once these
 # patches are merged we can remove these lines.
+#
+# LU-19539 osd-zfs: use osd_dmu_write() wrapper for xattrs
+# LU-19761 osd-zfs: Build against ZFS 2.4.0
+# LU-19249 build: Compatibility updates for kernel v6.16
+#
 patches=('https://review.whamcloud.com/changes/fs%2Flustre-release~62101/revisions/2/patch?download'
-	'https://review.whamcloud.com/changes/fs%2Flustre-release~63267/revisions/9/patch?download')
+	'https://review.whamcloud.com/changes/fs%2Flustre-release~63267/revisions/9/patch?download'
+	'https://review.whamcloud.com/changes/fs%2Flustre-release~60619/revisions/13/patch?download')
 
 for p in "${patches[@]}" ; do
 	curl $p | base64 -d > patch

From ec65e4b6bb01a892839e684479910d787b897b4d Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Fri, 29 May 2026 02:32:07 +0200
Subject: [PATCH 086/129] CI: skip smatch, zloop, and zfs-arm for
 documentation-only changes

Follow-up to #18518, which skipped the qemu matrix on doc-only PRs.
zloop, zfs-arm, and smatch are irrelevant to doc-only changes.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18601
---
 .github/workflows/smatch.yml  | 8 ++++++++
 .github/workflows/zfs-arm.yml | 8 ++++++++
 .github/workflows/zloop.yml   | 8 ++++++++
 3 files changed, 24 insertions(+)

diff --git a/.github/workflows/smatch.yml b/.github/workflows/smatch.yml
index ffad83b64ea..b6f47d8d41a 100644
--- a/.github/workflows/smatch.yml
+++ b/.github/workflows/smatch.yml
@@ -3,6 +3,14 @@ name: smatch
 on:
   push:
   pull_request:
+    paths-ignore:
+      - 'man/**'
+      - '**.md'
+      - 'AUTHORS'
+      - 'COPYRIGHT'
+      - 'LICENSE'
+      - 'NOTICE'
+      - '.gitignore'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
diff --git a/.github/workflows/zfs-arm.yml b/.github/workflows/zfs-arm.yml
index fb91f198f6d..4166ad53d5c 100644
--- a/.github/workflows/zfs-arm.yml
+++ b/.github/workflows/zfs-arm.yml
@@ -3,6 +3,14 @@ name: zfs-arm
 on:
   push:
   pull_request:
+    paths-ignore:
+      - 'man/**'
+      - '**.md'
+      - 'AUTHORS'
+      - 'COPYRIGHT'
+      - 'LICENSE'
+      - 'NOTICE'
+      - '.gitignore'
   workflow_dispatch:
     inputs:
       gcc_ver:
diff --git a/.github/workflows/zloop.yml b/.github/workflows/zloop.yml
index 7f76a670af9..edd2c391583 100644
--- a/.github/workflows/zloop.yml
+++ b/.github/workflows/zloop.yml
@@ -3,6 +3,14 @@ name: zloop
 on:
   push:
   pull_request:
+    paths-ignore:
+      - 'man/**'
+      - '**.md'
+      - 'AUTHORS'
+      - 'COPYRIGHT'
+      - 'LICENSE'
+      - 'NOTICE'
+      - '.gitignore'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

From eafa39fbc3dbbadeda320a9bf418eaab75a88e41 Mon Sep 17 00:00:00 2001
From: Timothy Day <tday141@gmail.com>
Date: Fri, 29 May 2026 16:40:14 +0000
Subject: [PATCH 087/129] build: add ZFS_DEBUG Kconfig for copy-builtin

... so we can toggle ZFS debug assertions from the
Linux kernel build without having to regenerate the
ZFS patch.

Update the qemu test script to also set this kernel
config.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Timothy Day <timday@thelustrecollective.com>
Co-authored-by: Timothy Day <timday@thelustrecollective.com>
Closes #18595
---
 .github/workflows/scripts/qemu-6-tests.sh |  1 +
 config/zfs-build.m4                       | 12 ++++++++++++
 copy-builtin                              | 11 +++++++++++
 module/Kbuild.in                          | 11 +++++++++--
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/scripts/qemu-6-tests.sh b/.github/workflows/scripts/qemu-6-tests.sh
index c261cbfca06..41c34511357 100755
--- a/.github/workflows/scripts/qemu-6-tests.sh
+++ b/.github/workflows/scripts/qemu-6-tests.sh
@@ -79,6 +79,7 @@ function do_builtin_build() {
 
   cd $HOME/linux-$fullver
   ./scripts/config --enable ZFS
+  ./scripts/config --enable ZFS_DEBUG
   yes "" | make oldconfig
   make -j `nproc`
   ) &> /var/tmp/builtin.txt || rc=$?
diff --git a/config/zfs-build.m4 b/config/zfs-build.m4
index 1e1485c4cf4..7aabad3868b 100644
--- a/config/zfs-build.m4
+++ b/config/zfs-build.m4
@@ -39,6 +39,18 @@ dnl # (If INVARIANTS is detected, we need to force DEBUG, or strange panics
 dnl # can ensue.)
 dnl #
 AC_DEFUN([ZFS_AC_DEBUG], [
+	dnl #
+	dnl # In the Linux kernel copy-builtin build, assertion/debug support
+	dnl # is selected by CONFIG_ZFS_DEBUG (Kconfig).
+	dnl #
+	AH_BOTTOM([
+#ifdef CONFIG_ZFS
+#undef ZFS_DEBUG
+#ifdef CONFIG_ZFS_DEBUG
+#define ZFS_DEBUG 1
+#endif
+#endif])
+
 	AC_MSG_CHECKING([whether assertion support will be enabled])
 	AC_ARG_ENABLE([debug],
 		[AS_HELP_STRING([--enable-debug],
diff --git a/copy-builtin b/copy-builtin
index 9a430bfb289..d412437f556 100755
--- a/copy-builtin
+++ b/copy-builtin
@@ -43,6 +43,17 @@ config ZFS
 
 	  To compile this file system support as a module, choose M here.
 
+	  If unsure, say N.
+
+config ZFS_DEBUG
+	bool "ZFS debugging"
+	depends on ZFS
+	help
+	  Enable ZFS debugging. This turns on all ASSERT() assertions,
+	  enables additional debug-only code paths, and promotes
+	  compiler warnings to errors. This should only be enabled for
+	  development or troubleshooting.
+
 	  If unsure, say N.
 EOF
 
diff --git a/module/Kbuild.in b/module/Kbuild.in
index ff2c96b85ae..fa4085c84b0 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -4,9 +4,11 @@
 
 ZFS_MODULE_CFLAGS += -std=gnu11 -Wno-declaration-after-statement
 ZFS_MODULE_CFLAGS += -Wmissing-prototypes
-ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @KERNEL_NO_FORMAT_ZERO_LENGTH@
+ZFS_MODULE_CFLAGS += @KERNEL_NO_FORMAT_ZERO_LENGTH@
 
 ifneq ($(KBUILD_EXTMOD),)
+ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@
+ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@
 zfs_include = @abs_top_srcdir@/include
 icp_include = @abs_srcdir@/icp/include
 zstd_include = @abs_srcdir@/zstd/include
@@ -16,6 +18,12 @@ ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include
 src = @abs_srcdir@
 obj = @abs_builddir@
 else
+ifeq ($(CONFIG_ZFS_DEBUG),y)
+ZFS_MODULE_CFLAGS += -Werror
+ZFS_MODULE_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+else
+ZFS_MODULE_CPPFLAGS += -UDEBUG -DNDEBUG
+endif
 zfs_include = $(srctree)/include/zfs
 icp_include = $(src)/icp/include
 zstd_include = $(src)/zstd/include
@@ -28,7 +36,6 @@ ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl
 ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs
 ZFS_MODULE_CFLAGS += -I$(zfs_include)
 ZFS_MODULE_CPPFLAGS += -D_KERNEL
-ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@
 
 # KASAN enables -Werror=frame-larger-than=1024, which
 # breaks oh so many parts of our build.

From c90dc280898e6d95d30f4936e895998f8cebeb93 Mon Sep 17 00:00:00 2001
From: Alek P <alek-p@users.noreply.github.com>
Date: Fri, 29 May 2026 21:13:39 -0400
Subject: [PATCH 088/129] enforce exact decompressed length for lz4, gzip, and
 zstd

Decompressors must expand a ZFS block to exactly the expected number
of bytes. Treat decompression to an unexpected length as failure, so
truncated or short output is not accepted as valid decompression. This
makes our handling of decompress return values consistent with the
decompression functions' APIs.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Alek Pinchuk <Alek.Pinchuk@connectwise.com>
Closes #18599
---
 module/zfs/gzip.c      |  8 ++++++--
 module/zfs/lz4_zfs.c   | 15 +++++++++++----
 module/zstd/zfs_zstd.c |  9 +++++++++
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c
index d183e998456..2dee3e1da78 100644
--- a/module/zfs/gzip.c
+++ b/module/zfs/gzip.c
@@ -96,13 +96,17 @@ zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len,
 	/* check if hardware accelerator can be used */
 	if (qat_dc_use_accel(d_len)) {
 		if (qat_compress(QAT_DECOMPRESS, s_start, s_len,
-		    d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS)
-			return (0);
+		    d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS) {
+			if ((size_t)dstlen == d_len)
+				return (0);
+		}
 		/* if hardware de-compress fail, do it again with software */
 	}
 
 	if (uncompress_func(d_start, &dstlen, s_start, s_len) != Z_OK)
 		return (-1);
+	if ((size_t)dstlen != d_len)
+		return (-1);
 
 	return (0);
 }
diff --git a/module/zfs/lz4_zfs.c b/module/zfs/lz4_zfs.c
index 0c03a6855c7..672b1bd27e6 100644
--- a/module/zfs/lz4_zfs.c
+++ b/module/zfs/lz4_zfs.c
@@ -88,17 +88,24 @@ zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len,
 	(void) n;
 	const char *src = s_start;
 	uint32_t bufsiz = BE_IN32(src);
+	int decoded;
 
 	/* invalid compressed buffer size encoded at start */
 	if (bufsiz + sizeof (bufsiz) > s_len)
 		return (1);
 
 	/*
-	 * Returns 0 on success (decompression function returned non-negative)
-	 * and non-zero on failure (decompression function returned negative).
+	 * LZ4_uncompress_unknownOutputSize returns the number of bytes decoded
+	 * on success, or a negative value on failure. An OpenZFS block must
+	 * expand to exactly d_len bytes.
 	 */
-	return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
-	    d_start, bufsiz, d_len) < 0);
+	decoded = LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
+	    d_start, bufsiz, d_len);
+	if (decoded < 0)
+		return (1);
+	if (d_len != (size_t)decoded)
+		return (1);
+	return (0);
 }
 
 ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress)
diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c
index 82212055f0e..f38800f7f34 100644
--- a/module/zstd/zfs_zstd.c
+++ b/module/zstd/zfs_zstd.c
@@ -682,6 +682,15 @@ zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
 		return (1);
 	}
 
+	/*
+	 * An OpenZFS compressed block must expand to exactly d_len bytes.
+	 * ZSTD_decompressDCtx returns the decompressed size on success.
+	 */
+	if (result != d_len) {
+		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
+		return (1);
+	}
+
 	if (level) {
 		*level = curlevel;
 	}

From bfb914ca5836b18e085329d11f6fddcb72234202 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Mon, 1 Jun 2026 03:25:26 +0200
Subject: [PATCH 089/129] CI: apt-get update before purging host packages

The package removal ran against a stale package index and failed to
fetch a package that had been removed from the repository. Refresh
the index first.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18607
Closes #18609
---
 .github/workflows/scripts/qemu-1-setup.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/scripts/qemu-1-setup.sh b/.github/workflows/scripts/qemu-1-setup.sh
index 5c41a4d6a49..2e83b441588 100755
--- a/.github/workflows/scripts/qemu-1-setup.sh
+++ b/.github/workflows/scripts/qemu-1-setup.sh
@@ -17,6 +17,8 @@ sudo docker builder prune -a
 unneeded="microsoft-edge-stable|azure-cli|google-cloud|google-chrome-stable|"\
 "temurin|llvm|firefox|mysql-server|snapd|android|dotnet|haskell|ghcup|"\
 "powershell|julia|swift|miniconda|chromium"
+# refresh package index before removing packages
+sudo apt-get -y update
 sudo apt-get -y remove $(dpkg-query -f '${binary:Package}\n' -W | grep -E "'$unneeded'")
 sudo apt-get -y autoremove
 

From 20d56830f9d6c5a4c4ac0c5c2ad8626d2bf77c01 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Mon, 1 Jun 2026 03:27:40 +0200
Subject: [PATCH 090/129] CI: add concurrency support to zfs-arm

The zfs-arm workflow was the only build/test workflow without a
concurrency block, so superseded runs were not cancelled.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18608
---
 .github/workflows/zfs-arm.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/zfs-arm.yml b/.github/workflows/zfs-arm.yml
index 4166ad53d5c..84e1272f713 100644
--- a/.github/workflows/zfs-arm.yml
+++ b/.github/workflows/zfs-arm.yml
@@ -19,6 +19,10 @@ on:
         default: ""
         description: "(optional) install specific GCC version, like '16'"
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
   zfs-arm:
     name: ZFS ARM build

From 4bc8c39b62de7325c954acb3bd047f2bce1e8c55 Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Mon, 1 Jun 2026 17:49:38 -0400
Subject: [PATCH 091/129] zed: Prefer dRAID distributed spares to regular ones

One of the main dRAID features is avoiding single drive bottlenecks
by using distributed spares.  Activation of a regular spare will take
more time, during which the dRAID redundancy is even lower than in
case of RAIDZ.  But regular spares might still be added to the pool
as a second line of defence, possibly shared by several vdevs.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18578
---
 cmd/zed/agents/zfs_retire.c                   | 30 ++++++++++++++-----
 module/zfs/vdev_label.c                       |  5 ++++
 .../functional/fault/auto_spare_multiple.ksh  |  8 ++---
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index 0c6c30f2e86..ba3672a30a7 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -355,18 +355,30 @@ is_draid_fdomain_failure(fmd_hdl_t *hdl, libzfs_handle_t *zhdl,
  * replacing a failed vdev with the given characteristics.
  *
  * Ordering criteria (most to least significant):
- *  1. Matching rotational is preferred over mismatching.
- *  2. Large enough is preferred over (potentially?) too small.
- *  3. Smaller size is preferred over bigger (best fit).
+ *  1. Distributed spare matching the failed vdev's dRAID is preferred
+ *     most (distributed spares rebuild faster than traditional spares).
+ *     Regular spares (no TOP_GUID) come next.  Non-matching distributed
+ *     spares are tried last, as the kernel will reject them anyway.
+ *  2. Matching rotational is preferred over mismatching.
+ *  3. Large enough is preferred over too small.
+ *  4. Smaller size is preferred over bigger (best fit).
  */
 static boolean_t
 spare_is_preferred(nvlist_t *a, nvlist_t *b, boolean_t have_rotational,
-    uint64_t vdev_rotational, uint64_t vdev_size)
+    uint64_t vdev_rotational, uint64_t vdev_size, uint64_t top_guid)
 {
-	uint64_t a_rotational = 0, b_rotational = 0;
-	uint64_t a_size = 0, b_size = 0;
+	uint64_t a_top = 0, b_top = 0;
+	(void) nvlist_lookup_uint64(a, ZPOOL_CONFIG_TOP_GUID, &a_top);
+	(void) nvlist_lookup_uint64(b, ZPOOL_CONFIG_TOP_GUID, &b_top);
+	int a_pri = (a_top == 0) ? 1 :
+	    (a_top == top_guid || top_guid == 0) ? 2 : 0;
+	int b_pri = (b_top == 0) ? 1 :
+	    (b_top == top_guid || top_guid == 0) ? 2 : 0;
+	if (a_pri != b_pri)
+		return (a_pri > b_pri);
 
 	if (have_rotational) {
+		uint64_t a_rotational = 0, b_rotational = 0;
 		(void) nvlist_lookup_uint64(a, ZPOOL_CONFIG_VDEV_ROTATIONAL,
 		    &a_rotational);
 		(void) nvlist_lookup_uint64(b, ZPOOL_CONFIG_VDEV_ROTATIONAL,
@@ -378,6 +390,7 @@ spare_is_preferred(nvlist_t *a, nvlist_t *b, boolean_t have_rotational,
 
 	vdev_stat_t *vs;
 	unsigned int c;
+	uint64_t a_size = 0, b_size = 0;
 	if (nvlist_lookup_uint64_array(a, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&vs, &c) == 0)
 		a_size = vs->vs_rsize;
@@ -405,7 +418,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 	char *dev_name;
 	zprop_source_t source;
 	int ashift;
-	uint64_t vdev_rotational = 0, vdev_size = 0;
+	uint64_t vdev_rotational = 0, vdev_size = 0, top_guid = 0;
 	boolean_t have_vdev_rotational;
 	vdev_stat_t *vs;
 	unsigned int c;
@@ -430,6 +443,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 	if (nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&vs, &c) == 0)
 		vdev_size = vs->vs_rsize;
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_TOP_GUID, &top_guid);
 
 	/*
 	 * Build a sorted index array over the spares, so that better
@@ -443,7 +457,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 		int j = (int)s - 1;
 		while (j >= 0 && spare_is_preferred(spares[key],
 		    spares[order[j]], have_vdev_rotational, vdev_rotational,
-		    vdev_size)) {
+		    vdev_size, top_guid)) {
 			order[j + 1] = order[j];
 			j--;
 		}
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index b3042980aad..54d253c1b7d 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -467,6 +467,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
+	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+	    vd->vdev_top != NULL) {
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
+		    vd->vdev_top->vdev_guid);
+	}
 
 	if (vd->vdev_path != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh
index 023f5b58a6e..529a6a8c3fe 100755
--- a/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh
+++ b/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh
@@ -84,8 +84,8 @@ for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do
 		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
 		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
 		    spare $SPARE_DEV1
-		SPARE1=$SPARE_DEV1
-		SPARE2="draid2-0-0"
+		SPARE1="draid2-0-0"
+		SPARE2=$SPARE_DEV1
 	elif [ "$type" = "mirror" ]; then
 		# 1. Create a 3-way mirror pool with two hot spares
 		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
@@ -167,8 +167,8 @@ for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do
 		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
 		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
 		    spare $SPARE_DEV1
-		SPARE1=$SPARE_DEV1
-		SPARE2="draid2-0-0"
+		SPARE1="draid2-0-0"
+		SPARE2=$SPARE_DEV1
 	elif [ "$type" = "mirror" ]; then
 		# 1. Create a 3-way mirror pool with two hot spares
 		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS

From 59dc88602e23a436440e4164c6d9401da8f0dff2 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Mon, 1 Jun 2026 14:55:20 -0700
Subject: [PATCH 092/129] nvpair: Check for un-terminated strings in packed
 nvlist

Add additional checks to verify a packed string or string array nvpair
is terminated.  Or more specifically, verify doing a strlen() on the
prospective string does not overrun the packed nvlist buffer.

Also add additional checks in the libzfs_input_checks test case to
verify un-terminated strings, and add in a nvlist ioctl payload
fuzz test for good measure.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18604
---
 module/nvpair/nvpair.c                   |  49 +++++--
 module/zfs/zfs_ioctl.c                   |   3 +-
 tests/zfs-tests/cmd/libzfs_input_check.c | 172 ++++++++++++++++++++++-
 3 files changed, 207 insertions(+), 17 deletions(-)

diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c
index 07ac102145e..52678bb2bad 100644
--- a/module/nvpair/nvpair.c
+++ b/module/nvpair/nvpair.c
@@ -135,7 +135,8 @@
 #define	NVP_SIZE_CALC(name_len, data_len) \
 	(NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
 
-static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
+static int i_get_value_size(data_type_t type, const void *data, uint_t nelem,
+    size_t max_size);
 static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
     uint_t nelem, const void *data);
 
@@ -810,8 +811,10 @@ i_validate_nvpair(nvpair_t *nvp)
 	 * verify nvp_type, nvp_value_elem, and also possibly
 	 * verify string values and get the value size.
 	 */
-	size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
 	size1 = nvp->nvp_size - NVP_VALOFF(nvp);
+	size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp),
+	    size1);
+
 	if (size2 < 0 || size1 != NV_ALIGN(size2))
 		return (EFAULT);
 
@@ -1002,12 +1005,21 @@ nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
  * 	DATA_TYPE_STRING    	and
  *	DATA_TYPE_STRING_ARRAY
  * Is data == NULL then the size of the string(s) is excluded.
+ *
+ * If 'max_size' is non-zero, then don't look beyond 'max_size' number of
+ * bytes when calculating a value size. Note that 'max_size' should include
+ * the NULL terminator byte when calculating string size.  If 'max_size' is 0,
+ * it is ignored.
  */
 static int
-i_get_value_size(data_type_t type, const void *data, uint_t nelem)
+i_get_value_size(data_type_t type, const void *data, uint_t nelem,
+    size_t max_size)
 {
 	uint64_t value_sz;
 
+	if (max_size == 0)
+		max_size = INT32_MAX;
+
 	if (i_validate_type_nelem(type, nelem) != 0)
 		return (-1);
 
@@ -1052,10 +1064,15 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem)
 		break;
 #endif
 	case DATA_TYPE_STRING:
-		if (data == NULL)
+		if (data == NULL) {
 			value_sz = 0;
-		else
-			value_sz = strlen(data) + 1;
+		} else {
+			value_sz = strnlen(data, max_size);
+			if (value_sz >= max_size) {
+				return (-1);	/* string not terminated */
+			}
+			value_sz += 1;
+		}
 		break;
 	case DATA_TYPE_BOOLEAN_ARRAY:
 		value_sz = (uint64_t)nelem * sizeof (boolean_t);
@@ -1089,16 +1106,23 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem)
 		break;
 	case DATA_TYPE_STRING_ARRAY:
 		value_sz = (uint64_t)nelem * sizeof (uint64_t);
-
 		if (data != NULL) {
 			char *const *strs = data;
 			uint_t i;
+			size_t newsize;
 
 			/* no alignment requirement for strings */
 			for (i = 0; i < nelem; i++) {
 				if (strs[i] == NULL)
 					return (-1);
-				value_sz += strlen(strs[i]) + 1;
+
+				newsize = strnlen(strs[i], max_size);
+
+				if (newsize == max_size)
+					return (-1);	/* not terminated */
+
+				value_sz += newsize + 1; /* +1 for NULL */
+				max_size -= newsize + 1;
 			}
 		}
 		break;
@@ -1163,7 +1187,7 @@ nvlist_add_common(nvlist_t *nvl, const char *name,
 	 * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
 	 * is the size of the string(s) included.
 	 */
-	if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
+	if ((value_sz = i_get_value_size(type, data, nelem, 0)) < 0)
 		return (EINVAL);
 
 	if (i_validate_nvpair_value(type, nelem, data) != 0)
@@ -1588,7 +1612,7 @@ nvpair_value_common(const nvpair_t *nvp, data_type_t type, uint_t *nelem,
 #endif
 		if (data == NULL)
 			return (EINVAL);
-		if ((value_sz = i_get_value_size(type, NULL, 1)) < 0)
+		if ((value_sz = i_get_value_size(type, NULL, 1, 0)) < 0)
 			return (EINVAL);
 		memcpy(data, NVP_VALUE(nvp), (size_t)value_sz);
 		if (nelem != NULL)
@@ -3019,7 +3043,8 @@ nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
 	 * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
 	 * is the size of the string(s) excluded.
 	 */
-	if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
+	if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp),
+	    NVP_SIZE(nvp))) < 0)
 		return (EFAULT);
 
 	if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
@@ -3333,7 +3358,7 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
 	 * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
 	 * is the size of the string(s) excluded.
 	 */
-	if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
+	if ((value_sz = i_get_value_size(type, NULL, nelem, NVP_SIZE(nvp))) < 0)
 		return (EFAULT);
 
 	/* if there is no data to extract then return */
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index d31aa80641c..7013fbfb64f 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4126,7 +4126,6 @@ static int
 zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	(void) unused, (void) outnvl;
-	const char *message;
 	char *poolname;
 	spa_t *spa;
 	int error;
@@ -4147,7 +4146,7 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
 	if (error != 0)
 		return (error);
 
-	message = fnvlist_lookup_string(innvl, "message");
+	const char *message = fnvlist_lookup_string(innvl, "message");
 
 	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
 		spa_close(spa, FTAG);
diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c
index 4ef249bbd4a..8f7e36d9efa 100644
--- a/tests/zfs-tests/cmd/libzfs_input_check.c
+++ b/tests/zfs-tests/cmd/libzfs_input_check.c
@@ -85,7 +85,6 @@ static const zfs_ioc_t ioc_skip[] = {
 	ZFS_IOC_DSOBJ_TO_DSNAME,
 	ZFS_IOC_OBJ_TO_PATH,
 	ZFS_IOC_POOL_SET_PROPS,
-	ZFS_IOC_POOL_GET_PROPS,
 	ZFS_IOC_SET_FSACL,
 	ZFS_IOC_GET_FSACL,
 	ZFS_IOC_SHARE,
@@ -125,11 +124,136 @@ static const zfs_ioc_t ioc_skip[] = {
 		lzc_ioctl_test(ioc, name, req, opt, err, wild);	\
 	} while (0)
 
+#define	IOC_INPUT_TEST_INJECT(ioc, name, innvl)			\
+	do {							\
+		active_test = __func__ + 5;			\
+		lzc_ioctl_run_impl(ioc, name, innvl, 0, B_TRUE);	\
+	} while (0)
+
+/*
+ * Given a zfs_cmd_t containing an already packed nvlist in zc->zc_nvlist_src,
+ * and its original innvl, look in innvl for the last string nvpair, or last
+ * string array nvpair, and remove the string terminator.  The idea is to
+ * corrupt the nvlist string value so that anyone doing a strlen() on it will
+ * read past the end of the packed nvlist buffer and trigger a crash.
+ */
+static void
+do_bad_string(zfs_cmd_t *zc, nvlist_t *innvl)
+{
+	nvpair_t *elem = NULL;
+	nvpair_t *lastseen = NULL;
+	const char *str = NULL;
+	const char **arr;
+	uint_t n;
+	char *off;
+	char *packed;
+	uint64_t size, off_size;
+
+	while ((elem = nvlist_next_nvpair(innvl, elem)) != NULL) {
+		if ((nvpair_type(elem) == DATA_TYPE_STRING) ||
+		    (nvpair_type(elem) == DATA_TYPE_STRING_ARRAY))
+			lastseen = elem;
+	}
+
+	if (lastseen == NULL)
+		return;	/* No strings */
+
+	/*
+	 * Lookup either the last string, or the last string in the last
+	 * string array in the nvlist.  We will use this to corrupt from the
+	 * string to the end of the nvlist buffer.  Any attempts to strlen this
+	 * string should run past the end of the packed buffer.
+	 */
+	if (nvpair_value_string(lastseen, &str) != 0 &&
+	    nvpair_value_string_array(lastseen, &arr, &n) == 0 && n > 0) {
+		str = arr[n - 1];
+	}
+
+	/*
+	 * We now have the last string (if any).  Overwrite everything from
+	 * its NUL terminator byte to the end of the packed nvlist buffer,
+	 * leaving the string unterminated.
+	 */
+	if (str == NULL)
+		return;	/* Nothing to corrupt */
+	packed = (char *)(uintptr_t)zc->zc_nvlist_src;
+	size = zc->zc_nvlist_src_size;
+	off_size = strlen(str);
+
+	off = memmem(packed, size, str, off_size);
+	if (off != NULL)
+		memset(&off[off_size], '!', (packed + size) - &off[off_size]);
+}
+
+/*
+ * For each byte in the packed nvlist in zc, corrupt a single byte, then
+ * try doing the ioctl.  This tests how well the kernel handles fuzzed nvlists.
+ *
+ * NOTE - make sure you are doing this with a "safe" ioctl!  You don't want to
+ * run this on an ioctl that can potentially corrupt data (like a zpool create).
+ */
+static void
+do_fuzz(int zfs_fd, zfs_ioc_t ioc, zfs_cmd_t *zc)
+{
+	uint64_t size;
+	uint64_t i;
+	unsigned char old = 0;
+	unsigned char *pos;
+	zfs_cmd_t orig_zc = *zc;
+
+	pos = (unsigned char *)(uintptr_t)zc->zc_nvlist_src;
+	size = zc->zc_nvlist_src_size;
+
+	/*
+	 * Fuzz each byte in the packed nvlist, one byte at a time, and do the
+	 * ioctl.  If the kernel doesn't crash, then the test passed.
+	 */
+	for (i = 0; i < size; i++) {
+		/* Restore the previously corrupted byte */
+		if (i > 0)
+			pos[i-1] = old;
+
+		old = pos[i];
+
+		/* Corrupt the new byte */
+		pos[i]++;
+
+		/*
+		 * Do the ioctl and ignore the return code.  We just want to
+		 * see if the kernel panics.
+		 */
+		lzc_ioctl_fd(zfs_fd, ioc, zc);
+
+		/*
+		 * Restore 'zc' with original fields since the ioctl may
+		 * have modified them.
+		 */
+		*zc = orig_zc;
+	}
+	/* Restore last byte */
+	if (i > 0)
+		pos[i - 1] = old;
+
+	/*
+	 * Try fuzzing the packed nvlist size field.  Test it with one byte
+	 * bigger and one byte smaller than the current value.
+	 */
+	zc->zc_nvlist_src_size--;
+	lzc_ioctl_fd(zfs_fd, ioc, zc);
+
+	zc->zc_nvlist_src_size += 2;
+	lzc_ioctl_fd(zfs_fd, ioc, zc);
+
+	/* Restore to normal */
+	zc->zc_nvlist_src_size -= 1;
+}
+
 /*
  * run a zfs ioctl command, verify expected results and log failures
  */
 static void
-lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected)
+lzc_ioctl_run_impl(zfs_ioc_t ioc, const char *name, nvlist_t *innvl,
+    int expected, boolean_t do_corrupt)
 {
 	zfs_cmd_t zc = {"\0"};
 	char *packed = NULL;
@@ -160,10 +284,30 @@ lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected)
 	zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
 	zc.zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(zc.zc_nvlist_dst_size);
 
+	if (do_corrupt) {
+		/*
+		 * Try changing bytes in the packed nvlist to see if it will
+		 * panic the kernel when you do the ioctl.
+		 */
+		do_fuzz(zfs_fd, ioc, &zc);
+
+		/*
+		 * Corrupt the last string in the packed nvlist so it has no
+		 * NULL terminator.
+		 */
+		do_bad_string(&zc, innvl);
+
+	}
+
 	if (lzc_ioctl_fd(zfs_fd, ioc, &zc) != 0)
 		error = errno;
 
-	if (error != expected) {
+	/*
+	 * If we're corrupting the nvlist we don't care about the specific
+	 * error code that gets returned, as it could be one of many.  We only
+	 * care if it panics the kernel.
+	 */
+	if (!do_corrupt && error != expected) {
 		unexpected_failures = B_TRUE;
 		(void) fprintf(stderr, "%s: Unexpected result with %s, "
 		    "error %d (expecting %d)\n",
@@ -174,6 +318,12 @@ lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected)
 	free((void *)(uintptr_t)zc.zc_nvlist_dst);
 }
 
+static void
+lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected)
+{
+	lzc_ioctl_run_impl(ioc, name, innvl, expected, B_FALSE);
+}
+
 /*
  * Test each ioc for the following ioctl input errors:
  *   ZFS_ERR_IOC_ARG_UNAVAIL	an input argument is not supported by kernel
@@ -310,6 +460,7 @@ test_log_history(const char *pool)
 	fnvlist_add_string(required, "message", "input check");
 
 	IOC_INPUT_TEST(ZFS_IOC_LOG_HISTORY, pool, required, NULL, 0);
+	IOC_INPUT_TEST_INJECT(ZFS_IOC_LOG_HISTORY, pool, required);
 
 	nvlist_free(required);
 }
@@ -791,6 +942,20 @@ test_set_bootenv(const char *pool)
 	nvlist_free(required);
 }
 
+static void
+test_zpool_get(const char *pool)
+{
+	const char *strs[] = {ZPOOL_DEDUPCACHED_PROP_NAME};
+	nvlist_t *optional = fnvlist_alloc();
+
+	fnvlist_add_string_array(optional, ZPOOL_GET_PROPS_NAMES, strs, 1);
+
+	IOC_INPUT_TEST(ZFS_IOC_POOL_GET_PROPS, pool, NULL, optional, 0);
+	IOC_INPUT_TEST_INJECT(ZFS_IOC_POOL_GET_PROPS, pool, optional);
+
+	nvlist_free(optional);
+}
+
 static void
 zfs_ioc_input_tests(const char *pool)
 {
@@ -885,6 +1050,7 @@ zfs_ioc_input_tests(const char *pool)
 
 	test_scrub(pool);
 
+	test_zpool_get(pool);
 	/*
 	 * cleanup
 	 */

From 037368b124fa18df9ed4a1f2582879d43481d9f5 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Tue, 2 Jun 2026 18:33:22 +0200
Subject: [PATCH 093/129] metaslab: expose condense_pct and sm_blksz tunables
 on Linux

Expose zfs_metaslab_condense_pct and zfs_metaslab_sm_blksz_* as
module parameters on Linux, matching their existing FreeBSD sysctls.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18594
---
 include/sys/metaslab_impl.h       |  2 +-
 man/man4/zfs.4                    | 26 ++++++++++++++++++++
 module/os/freebsd/zfs/sysctl_os.c | 40 -------------------------------
 module/zfs/metaslab.c             | 23 ++++++++++++++----
 4 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index faeb96fe965..44a4d4ddf75 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -330,7 +330,7 @@ struct metaslab_group {
  *
  * As the space map grows (as a result of the appends) it will
  * eventually become space-inefficient.  When the metaslab's in-core
- * free tree is zfs_condense_pct/100 times the size of the minimal
+ * free tree is zfs_metaslab_condense_pct/100 times the size of the minimal
  * on-disk representation, we rewrite it in its minimized form.  If a
  * metaslab needs to condense then we must set the ms_condensing flag to
  * ensure that allocations are not performed on the metaslab that is
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 657070de02a..09195b03e1a 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -446,6 +446,32 @@ This improves performance, especially when there are many metaslabs per vdev
 and the allocation can't actually be satisfied
 (so we would otherwise iterate all metaslabs).
 .
+.It Sy zfs_metaslab_sm_blksz_no_log Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq int
+Block size for the metaslab space maps in pools where the
+.Sy log_spacemap
+feature is disabled.
+Multiple metaslabs are modified per transaction group, so a smaller block size
+lets more, scattered I/O operations be issued.
+Must be a power of 2 greater than
+.Sy 4096 .
+This parameter can only be set at module load time.
+.
+.It Sy zfs_metaslab_sm_blksz_with_log Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int
+Block size for the metaslab space maps in pools where the
+.Sy log_spacemap
+feature is enabled.
+Changes are batched in the per-pool log spacemap and flushed to each metaslab's
+space map only occasionally, so a larger block size is more efficient.
+Must be a power of 2 greater than
+.Sy 4096 .
+This parameter can only be set at module load time.
+.
+.It Sy zfs_metaslab_condense_pct Ns = Ns Sy 200 Ns % Pq uint
+Condense an on-disk space map when its size exceeds this percentage of
+the in-memory representation.
+The minimum is
+.Sy 100 .
+.
 .It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq uint
 When a vdev is added, target this number of metaslabs per top-level vdev.
 .
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index eb7ada12e14..a0a721aec20 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -289,46 +289,6 @@ param_set_active_allocator(SYSCTL_HANDLER_ARGS)
 	return (param_set_active_allocator_common(buf));
 }
 
-/*
- * In pools where the log space map feature is not enabled we touch
- * multiple metaslabs (and their respective space maps) with each
- * transaction group. Thus, we benefit from having a small space map
- * block size since it allows us to issue more I/O operations scattered
- * around the disk. So a sane default for the space map block size
- * is 8~16K.
- */
-extern int zfs_metaslab_sm_blksz_no_log;
-
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log,
-	CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0,
-	"Block size for space map in pools with log space map disabled.  "
-	"Power of 2 greater than 4096.");
-
-/*
- * When the log space map feature is enabled, we accumulate a lot of
- * changes per metaslab that are flushed once in a while so we benefit
- * from a bigger block size like 128K for the metaslab space maps.
- */
-extern int zfs_metaslab_sm_blksz_with_log;
-
-SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log,
-	CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0,
-	"Block size for space map in pools with log space map enabled.  "
-	"Power of 2 greater than 4096.");
-
-/*
- * The in-core space map representation is more compact than its on-disk form.
- * The zfs_condense_pct determines how much more compact the in-core
- * space map representation must be before we compact it on-disk.
- * Values should be greater than or equal to 100.
- */
-extern uint_t zfs_condense_pct;
-
-SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct,
-	CTLFLAG_RWTUN, &zfs_condense_pct, 0,
-	"Condense on-disk spacemap when it is more than this many percents"
-	" of in-memory counterpart");
-
 /*
  * Minimum size which forces the dynamic allocator to change
  * it's allocation strategy.  Once the space map cannot satisfy
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 6ea3ecd74fc..959aa1b8384 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -82,11 +82,11 @@ int zfs_metaslab_sm_blksz_with_log = (1 << 17);
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
- * The zfs_condense_pct determines how much more compact the in-core
+ * The zfs_metaslab_condense_pct determines how much more compact the in-core
  * space map representation must be before we compact it on-disk.
  * Values should be greater than or equal to 100.
  */
-uint_t zfs_condense_pct = 200;
+uint_t zfs_metaslab_condense_pct = 200;
 
 /*
  * Condensing a metaslab is not guaranteed to actually reduce the amount of
@@ -3826,8 +3826,8 @@ metaslab_group_preload(metaslab_group_t *mg)
  *    increase as a result of writing out the free space range tree.
  *
  * 2. Condense if the on on-disk space map representation is at least
- *    zfs_condense_pct/100 times the size of the optimal representation
- *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
+ *    zfs_metaslab_condense_pct/100 times the size of the optimal representation
+ *    (i.e. zfs_metaslab_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
  *
  * 3. Do not condense if the on-disk size of the space map does not actually
  *    decrease.
@@ -3863,7 +3863,8 @@ metaslab_should_condense(metaslab_t *msp)
 	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
 	    msp->ms_allocatable, SM_NO_VDEVID);
 
-	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
+	return (object_size >=
+	    (optimal_size * zfs_metaslab_condense_pct / 100) &&
 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
 
@@ -6454,6 +6455,18 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
 	"Normally only consider this many of the best metaslabs in each vdev");
 
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, sm_blksz_no_log, INT, ZMOD_RD,
+	"Block size for space map in pools with log space map disabled.  "
+	"Power of 2 greater than 4096.");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, sm_blksz_with_log, INT, ZMOD_RD,
+	"Block size for space map in pools with log space map enabled.  "
+	"Power of 2 greater than 4096.");
+
 ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
 	param_set_active_allocator, param_get_charp, ZMOD_RW,
 	"SPA active allocator");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, condense_pct, UINT, ZMOD_RW,
+	"Condense on-disk spacemap when it is more than this many percents "
+	"of in-memory counterpart");

From 68980eb105eac9936604c75e8838941e2addaf23 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Tue, 2 Jun 2026 17:54:04 +1000
Subject: [PATCH 094/129] dsl_scan: close errorscrub cursor on pause

If the cursor were ever to actively hold resources, not finalising it
would mean leaking those resources whenever the scrub is paused.

The cursor is already reinitialized from the stored serialized form
if/when it is resumed, so there's nothing we need from the old one, just
to release it.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18603
---
 module/zfs/dsl_scan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 6f5dfac7b9d..03e13ca96cc 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -1280,6 +1280,7 @@ dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 		spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
 		scn->errorscrub_phys.dep_paused_flags = B_TRUE;
 		dsl_errorscrub_sync_state(scn, tx);
+		zap_cursor_fini(&scn->errorscrub_cursor);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
 	} else {
 		ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);

From efda1093ffa89a7b165a776c3d30ea44a13ee361 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sun, 10 May 2026 14:02:35 +1000
Subject: [PATCH 095/129] zap: add zap_cursor_init_by_dnode() & rework cursor
 resource lifetime

This commit adds zap_cursor_init_by_dnode() (and
zap_cursor_init_serialized_by_dnode()), which allow the target ZAP to
provided via an existing dnode rather than the traditional objset+object
pair.

This requires some reorganisation of the way that zap_cursor_t is
initialised. Up until now, zap_cursor_init() has merely stored the
objset, object, serialized form and prefetch flag, and left it until
zap_cursor_retrieve() to actually call zap_lock(). This makes a
_by_dnode() form complicated, because it is a held resource that needs
to be released, but might not be used if zap_cursor_retrieve() is not
called. So there's a bunch of state tracking required.

However, all cursor users immediately follow zap_cursor_init() with
zap_cursor_retrieve(), so there's nothing gained by delaying holds. This
allows us to simplify things, by calling zap_lock() directly in
zap_cursor_init() and retaining it until zap_cursor_fini().

This does, however, mean the _init() functions are now fallible, and can
return an error. This adds complexity to most of the call sites, which
are typically in a for loop of the form:

    for (zap_cursor_init(...);
      zap_cursor_retrieve(...) == 0;
      zap_cursor_advance(...))

To avoid needing to make significant changes at every call site, a
failed _init() call will also zero the cursor struct. If the caller
doesn't check the return and continues to zap_cursor_retrieve(), they
will get an EIO return, and zap_cursor_fini() will just return.

The existing zc_objset and zc_zapobj fields are retained to support
source backcompat for Lustre, which inspects them directly.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18603
---
 include/sys/zap.h |  19 +++++---
 module/zfs/zap.c  | 118 +++++++++++++++++++++++++++++-----------------
 2 files changed, 88 insertions(+), 49 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index 7e89ad7d3de..ad20d427ad9 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -443,16 +443,20 @@ void zap_attribute_free(zap_attribute_t *attrp);
 
 struct zap;
 struct zap_leaf;
+
 typedef struct zap_cursor {
 	/* This structure is opaque! */
-	objset_t *zc_objset;
 	struct zap *zc_zap;
 	struct zap_leaf *zc_leaf;
-	uint64_t zc_zapobj;
-	uint64_t zc_serialized;
 	uint64_t zc_hash;
 	uint32_t zc_cd;
 	boolean_t zc_prefetch;
+	/*
+	 * Legacy fields to main source compat with Lustre, which accesses
+	 * them directly. Not to be used in new code!
+	 */
+	objset_t *zc_objset;
+	uint64_t zc_zapobj;
 } zap_cursor_t;
 
 /*
@@ -460,14 +464,15 @@ typedef struct zap_cursor {
  * The entire zapobj will be prefetched. You must call zap_cursor_fini the
  * cursor when you are done with it.
  */
-void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
+int zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
+int zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn);
 void zap_cursor_fini(zap_cursor_t *zc);
 
 /*
  * Initialize a cursor at the beginning, but request that we not prefetch
  * the entire ZAP object.
  */
-void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
+int zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
     uint64_t zapobj);
 
 /*
@@ -477,8 +482,10 @@ void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
  * zapobj (ie.  zap_cursor_init_serialized(..., 0) is equivalent to
  * zap_cursor_init(...).)
  */
-void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os,
+int zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os,
     uint64_t zapobj, uint64_t serialized);
+int zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn,
+    uint64_t serialized);
 
 /*
  * Get the attribute currently pointed to by the cursor.  Returns
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index caed9c67794..ee94917d8e8 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -1072,53 +1072,100 @@ zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 
 /* zap_cursor */
 
-static void
-zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+static int
+zap_cursor_init_by_dnode_impl(zap_cursor_t *zc, dnode_t *dn,
     uint64_t serialized, boolean_t prefetch)
 {
-	zc->zc_objset = os;
 	zc->zc_zap = NULL;
 	zc->zc_leaf = NULL;
-	zc->zc_zapobj = zapobj;
-	zc->zc_serialized = serialized;
-	zc->zc_hash = 0;
-	zc->zc_cd = 0;
+
+	int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    zc, &zc->zc_zap);
+	if (err != 0)
+		return (err);
+
 	zc->zc_prefetch = prefetch;
+	zc->zc_objset = dn->dn_objset;
+	zc->zc_zapobj = dn->dn_object;
+
+	int hb = zap_hashbits(zc->zc_zap);
+	zc->zc_hash = serialized << (64 - hb);
+	zc->zc_cd = serialized >> hb;
+	if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+		zc->zc_cd = 0;
+
+	/*
+	 * Drop ZAP read lock, but keep the hold, so the holds on the
+	 * underlying dnode and header dbuf are maintained.
+	 */
+	rw_exit(&zc->zc_zap->zap_rwlock);
+
+	return (0);
 }
 
-void
+static int
+zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized, uint32_t prefetch)
+{
+	dnode_t *dn = NULL;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0) {
+		zc->zc_zap = NULL;
+		zc->zc_leaf = NULL;
+		return (err);
+	}
+
+	err = zap_cursor_init_by_dnode_impl(zc, dn, serialized, prefetch);
+
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+int
 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
-	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
+	return (zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE));
 }
 
-void
+int
+zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn)
+{
+	return (zap_cursor_init_by_dnode_impl(zc, dn, 0, B_TRUE));
+}
+
+int
 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
-	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
+	return (zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE));
 }
 
-void
+int
 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
     uint64_t serialized)
 {
-	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
+	return (zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE));
+}
+
+int
+zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn,
+    uint64_t serialized)
+{
+	return (zap_cursor_init_by_dnode_impl(zc, dn, serialized, B_TRUE));
 }
 
 void
 zap_cursor_fini(zap_cursor_t *zc)
 {
-	if (zc->zc_zap) {
-		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
-		zap_unlock(zc->zc_zap, NULL);
-		zc->zc_zap = NULL;
-	}
 	if (zc->zc_leaf) {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 		zap_put_leaf(zc->zc_leaf);
-		zc->zc_leaf = NULL;
 	}
-	zc->zc_objset = NULL;
+	if (zc->zc_zap) {
+		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+		zap_unlock(zc->zc_zap, zc);
+	}
+	memset(zc, 0, sizeof (zap_cursor_t));
 }
 
 int
@@ -1126,30 +1173,15 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 {
 	int err;
 
+	if (zc->zc_zap == NULL)
+		/* zap_cursor_init failed, cursor is invalid */
+		return (SET_ERROR(EIO));
+
 	if (zc->zc_hash == -1ULL)
 		return (SET_ERROR(ENOENT));
 
-	if (zc->zc_zap == NULL) {
-		int hb;
-		err = zap_lock(zc->zc_objset, zc->zc_zapobj, NULL,
-		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
-		if (err != 0)
-			return (err);
+	rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
 
-		/*
-		 * To support zap_cursor_init_serialized, advance, retrieve,
-		 * we must add to the existing zc_cd, which may already
-		 * be 1 due to the zap_cursor_advance.
-		 */
-		ASSERT0(zc->zc_hash);
-		hb = zap_hashbits(zc->zc_zap);
-		zc->zc_hash = zc->zc_serialized << (64 - hb);
-		zc->zc_cd += zc->zc_serialized >> hb;
-		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
-			zc->zc_cd = 0;
-	} else {
-		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
-	}
 	if (!zc->zc_zap->zap_ismicro) {
 		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
 	} else {
@@ -1184,6 +1216,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 			err = SET_ERROR(ENOENT);
 		}
 	}
+
 	rw_exit(&zc->zc_zap->zap_rwlock);
 	return (err);
 }
@@ -1199,10 +1232,9 @@ zap_cursor_advance(zap_cursor_t *zc)
 uint64_t
 zap_cursor_serialize(zap_cursor_t *zc)
 {
-	if (zc->zc_hash == -1ULL)
+	if (zc->zc_zap == NULL || zc->zc_hash == -1ULL)
 		return (-1ULL);
-	if (zc->zc_zap == NULL)
-		return (zc->zc_serialized);
+
 	ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap)));
 	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
 

From 8f933f53e23372edab2d2e9a550b89ee9188618b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Fri, 29 May 2026 14:38:48 +1000
Subject: [PATCH 096/129] unit/mock_dmu: track dnode refcount changes

The thing under test will be taking and releasing dnode refs/holds. By
counting them and exposing the current count, we can assert in test
cleanup that we haven't missed releasing any, especially in cases where
the hold is held across multiple test steps.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18603
---
 tests/unit/mock_dmu.c | 20 ++++++++++++++++++--
 tests/unit/mock_dmu.h |  3 +++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/tests/unit/mock_dmu.c b/tests/unit/mock_dmu.c
index 65c38c1fd9f..ae035498da6 100644
--- a/tests/unit/mock_dmu.c
+++ b/tests/unit/mock_dmu.c
@@ -28,6 +28,7 @@
 #include <sys/zfeature.h>
 
 #include "mock_dmu.h"
+#include "unit.h"
 
 /*
  * A mock dbuf. A real dmu_buf_t (first for casting) plus the attached user
@@ -48,6 +49,7 @@ typedef struct mock_dbuf mock_dbuf_t;
  */
 struct mock_dnode {
 	dnode_t			mdn_dn;
+	uint64_t		mdn_refcount;
 	size_t			mdn_blksize;
 	size_t			mdn_nblocks;
 	mock_dbuf_t		**mdn_blocks;
@@ -110,6 +112,7 @@ mock_dnode_create(size_t blksize, dmu_object_type_t type)
 	ASSERT(IS_P2ALIGNED(blksize, 512));
 
 	mock_dnode_t *mdn = kmem_zalloc(sizeof (mock_dnode_t), KM_SLEEP);
+	mdn->mdn_refcount = 1;
 	mdn->mdn_dn.dn_type = type;
 	mdn->mdn_dn.dn_object = 1;	/* arbitrary non-zero object number */
 	mdn->mdn_blksize = blksize;
@@ -156,6 +159,12 @@ mock_dnode_block_data(mock_dnode_t *mdn, uint64_t blkid)
 	return (mdn->mdn_blocks[blkid]->mdb_db.db_data);
 }
 
+uint64_t
+mock_dnode_refcount(mock_dnode_t *mdn)
+{
+	return (mdn->mdn_refcount);
+}
+
 /* Mock transaction */
 
 mock_dmu_tx_t *
@@ -258,14 +267,21 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
 boolean_t
 dnode_add_ref(dnode_t *dn, const void *tag)
 {
-	(void) dn; (void) tag;
+	(void) tag;
+	mock_dnode_t *mdn = (mock_dnode_t *)dn;
+	if (mdn->mdn_refcount == 0)
+		return (B_FALSE);
+	mdn->mdn_refcount++;
 	return (B_TRUE);
 }
 
 void
 dnode_rele(dnode_t *dn, const void *tag)
 {
-	(void) dn; (void) tag;
+	(void) tag;
+	mock_dnode_t *mdn = (mock_dnode_t *)dn;
+	unit_gt(mdn->mdn_refcount, 0);
+	mdn->mdn_refcount--;
 }
 
 /*
diff --git a/tests/unit/mock_dmu.h b/tests/unit/mock_dmu.h
index a46454c779f..2ac82c18b7a 100644
--- a/tests/unit/mock_dmu.h
+++ b/tests/unit/mock_dmu.h
@@ -40,6 +40,9 @@ size_t mock_dnode_block_count(mock_dnode_t *mdn);
 /* Returns a pointer to the data under the given block id. */
 const void *mock_dnode_block_data(mock_dnode_t *mdn, uint64_t blkid);
 
+/* Returns the current dnode ref (hold) count. */
+uint64_t mock_dnode_refcount(mock_dnode_t *mdn);
+
 /* Create/destroy a mock transaction handle. */
 mock_dmu_tx_t *mock_tx_create(void);
 void mock_tx_destroy(mock_dmu_tx_t *tx);

From a7170d144e9dab2f8704ba8c09b76ab26ed676d2 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Fri, 29 May 2026 14:43:53 +1000
Subject: [PATCH 097/129] unit/zap: check mock dnode refcount before
 destruction

It should be back at 1, where it started.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18603
---
 tests/unit/test_zap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_zap.c b/tests/unit/test_zap.c
index 5d53e49b2c7..5d9d78d0a26 100644
--- a/tests/unit/test_zap.c
+++ b/tests/unit/test_zap.c
@@ -103,7 +103,9 @@ mock_zap_is_fatzap(dnode_t *dn)
 static void
 mock_zap_destroy(dnode_t *dn)
 {
-	mock_dnode_destroy((mock_dnode_t *)dn);
+	mock_dnode_t *mdn = (mock_dnode_t *)dn;
+	unit_eq(mock_dnode_refcount(mdn), 1);
+	mock_dnode_destroy(mdn);
 }
 
 /* Create a ZAP of the type named in the given test params. */

From 49b71917a6c99fb53efdefdf7306ec7d2483d900 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 6 May 2026 11:10:52 +1000
Subject: [PATCH 098/129] unit/zap: basic cursor tests

These add a bunch of entries to the ZAP, and then ensure that a cursor
walk over the ZAP sees them all once and once only, and no others.

The serialization test takes it a bit further, by serializing and
recreating the cursor half way through and confirming it correctly picks
up from the same spot, and then recreating the cursor from serialized
again and confirming that it also sees only the second set of entries.
This ensures that the serialized cursor state is fully self contained
and not reliant on anything left over in the ZAP itself at serialization
time.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18603
---
 tests/unit/test_zap.c | 213 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 213 insertions(+)

diff --git a/tests/unit/test_zap.c b/tests/unit/test_zap.c
index 5d9d78d0a26..a08a899b794 100644
--- a/tests/unit/test_zap.c
+++ b/tests/unit/test_zap.c
@@ -517,6 +517,216 @@ test_fatzap_stats(const MunitParameter params[], void *data)
 
 /* ========== */
 
+/* Cursor tests. */
+
+/*
+ * Basic cursor test. Add a bunch of keys+values to a ZAP, read them back
+ * via cursor, confirm they're all there and nothing else is.
+ */
+static MunitResult
+test_cursor(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* For each ASCII letter as key, add a unique value to the ZAP. */
+	for (int i = 0; i < 26; i++) {
+		char c = (char)i + 'a';
+		char k[2] = { c, '\0' };
+		uint64_t v = (uint64_t)c * 11;
+		unit_ok(zap_add_by_dnode(dn, k, sizeof (uint64_t), 1, &v, tx));
+	}
+
+	/* Sanity check; confirm they're all there by count. */
+	uint64_t count = 0;
+	unit_ok(zap_count_by_dnode(dn, &count));
+	unit_eq(count, 26);
+
+	zap_cursor_t zc;
+	zap_attribute_t *za = zap_attribute_alloc();
+
+	unit_ok(zap_cursor_init_by_dnode(&zc, dn));
+
+	/*
+	 * Cursors don't guarantee an order, so we run over them them all,
+	 * confirm the key matches the value, and then set a bit for each
+	 * one we've seen. By the end, we should have seen them all.
+	 */
+	uint64_t seen = 0;
+	for (int i = 0; i < 26; i++) {
+		unit_ok(zap_cursor_retrieve(&zc, za));
+
+		/* Confirm attribute has the right details for the value. */
+		unit_eq(za->za_integer_length, sizeof (uint64_t));
+		unit_eq(za->za_num_integers, 1);
+
+		/*
+		 * And the right key in za_name. Note that we don't check
+		 * za_name_len, which is the length of a buffer that can
+		 * definitely hold the key, not the key length itself.
+		 */
+		char c = za->za_name[0];
+		unit_true(c >= 'a' && c <= 'z');
+		unit_zero(za->za_name[1]);
+
+		/* Check the value in the attribute. */
+		uint64_t v = (uint64_t)c * 11;
+		unit_eq(za->za_first_integer, v);
+
+		/*
+		 * Also do a direct lookup and confirm the value matches
+		 * the value from the attribute.
+		 */
+		char k[2] = { c, '\0' };
+		uint64_t result = 0;
+		unit_ok(zap_lookup_by_dnode(dn, k,
+		    sizeof (uint64_t), 1, &result));
+		unit_eq(result, v);
+
+		/* This one is good, set the bit to remember this fact. */
+		seen |= 1 << (c-'a');
+
+		zap_cursor_advance(&zc);
+	}
+
+	/* There should be no more keys in the ZAP. */
+	unit_err(zap_cursor_retrieve(&zc, za), ENOENT);
+
+	/* Bits 0-25 should be set if we've seen them all. */
+	unit_eq(seen, (1 << 26) - 1);
+
+	zap_attribute_free(za);
+	zap_cursor_fini(&zc);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/*
+ * Cursor serialize test. Add a bunch of items, use the cursor to read half of
+ * them back, then serialize the cursor. Reload the cursor from the serialized
+ * state and confirm that we pick up where we left off. Then do it again to
+ * ensure it doesn't rely on any internal state.
+ */
+static MunitResult
+test_cursor_serialize(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* For each ASCII letter as key, add a unique value to the ZAP. */
+	for (int i = 0; i < 26; i++) {
+		char c = (char)i + 'a';
+		char k[2] = { c, '\0' };
+		uint64_t v = (uint64_t)c * 11;
+		unit_ok(zap_add_by_dnode(dn, k, sizeof (uint64_t), 1, &v, tx));
+	}
+
+	/* Sanity check; confirm they're all there by count. */
+	uint64_t count = 0;
+	unit_ok(zap_count_by_dnode(dn, &count));
+	unit_eq(count, 26);
+
+	/*
+	 * Like test_cursor above, we'll walk over the ZAP and set bits
+	 * for each key we see.
+	 */
+	zap_cursor_t zc;
+	zap_attribute_t *za = zap_attribute_alloc();
+	uint64_t seen = 0;
+
+	unit_ok(zap_cursor_init_by_dnode(&zc, dn));
+	for (int i = 0; i < 13; i++) {
+		unit_ok(zap_cursor_retrieve(&zc, za));
+
+		char c = za->za_name[0];
+		unit_true(c >= 'a' && c <= 'z');
+
+		/* This one is good, set the bit to remember this fact. */
+		seen |= 1 << (c-'a');
+
+		zap_cursor_advance(&zc);
+	}
+
+	/* Serialise the and terminate the cursor. */
+	uint64_t cookie = zap_cursor_serialize(&zc);
+	zap_cursor_fini(&zc);
+
+	/*
+	 * Record the bits we saw in the first iteration; we'll use this
+	 * when we reload the cursor a second time below.
+	 */
+	uint64_t orig_seen = seen;
+
+	/* Reinitialise the cursor from the cookie. */
+	unit_ok(zap_cursor_init_serialized_by_dnode(&zc, dn, cookie));
+
+	/* Loop over the remaining entries and track them. */
+	for (int i = 0; i < 13; i++) {
+		unit_ok(zap_cursor_retrieve(&zc, za));
+
+		char c = za->za_name[0];
+		unit_true(c >= 'a' && c <= 'z');
+
+		/* This one is good, set the bit to remember this fact. */
+		seen |= 1 << (c-'a');
+
+		zap_cursor_advance(&zc);
+	}
+
+	/* There should be no more keys in the ZAP. */
+	unit_err(zap_cursor_retrieve(&zc, za), ENOENT);
+
+	/* Bits 0-25 should be set if we've seen them all. */
+	unit_eq(seen, (1 << 26) - 1);
+
+	/* Cursor done. */
+	zap_cursor_fini(&zc);
+
+	/*
+	 * Restore the seen state to before when we reinitialised the saved
+	 * cursor.
+	 */
+	seen = orig_seen;
+
+	/*
+	 * Do it all again a second time. This is making sure that the saved
+	 * cursor is usable even after the its been "used".
+	 */
+	unit_ok(zap_cursor_init_serialized_by_dnode(&zc, dn, cookie));
+	for (int i = 0; i < 13; i++) {
+		unit_ok(zap_cursor_retrieve(&zc, za));
+
+		char c = za->za_name[0];
+		unit_true(c >= 'a' && c <= 'z');
+
+		seen |= 1 << (c-'a');
+
+		zap_cursor_advance(&zc);
+	}
+
+	unit_err(zap_cursor_retrieve(&zc, za), ENOENT);
+	unit_eq(seen, (1 << 26) - 1);
+
+	zap_attribute_free(za);
+	zap_cursor_fini(&zc);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* ========== */
+
 /* Test suite definition and boilerplate. */
 
 #define	UNIT_PARAM_ZAP_TYPES(p)	\
@@ -546,6 +756,9 @@ static const MunitTest zap_tests[] = {
 	UNIT_TEST("microzap_stats",		test_microzap_stats),
 	UNIT_TEST("fatzap_stats",		test_fatzap_stats),
 
+	UNIT_TEST_ZAP_TYPES("cursor",		test_cursor),
+	UNIT_TEST_ZAP_TYPES("cursor_serialize",	test_cursor_serialize),
+
 	{ 0 },
 };
 

From bfe4a8bb9d3334bbf5fcbc470e1663e8b8c209b5 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Fri, 29 May 2026 15:57:01 +1000
Subject: [PATCH 099/129] unit/zap: test that cursors correctly release all
 dnode holds

Cursors defer taking holds until they're needed, so if a cursor is
created but not used, it may still hold resources that it would have
cleaned up along the way, but never got the chance to.

(this really happened in the first version of
zap_cursor_init_by_dnode(), so not a contrived case!)

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18603
---
 tests/unit/test_zap.c | 116 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/tests/unit/test_zap.c b/tests/unit/test_zap.c
index a08a899b794..276b6127e3f 100644
--- a/tests/unit/test_zap.c
+++ b/tests/unit/test_zap.c
@@ -725,6 +725,113 @@ test_cursor_serialize(const MunitParameter params[], void *data)
 	return (MUNIT_OK);
 }
 
+/*
+ * The following tests confirm that the cursor is properly cleaning up dnode
+ * holds taken (or not) across the lifetime of the cursor. The test is not
+ * about how or when it takes holds, only that the dnode refcount is the
+ * same before zap_cursor_init() as after zap_cursor_fini().
+ */
+static MunitResult
+test_cursor_release_unused(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+
+	uint64_t refcount = mock_dnode_refcount((mock_dnode_t *)dn);
+
+	zap_cursor_t zc;
+	unit_ok(zap_cursor_init_by_dnode(&zc, dn));
+	zap_cursor_fini(&zc);
+
+	unit_eq(refcount, mock_dnode_refcount((mock_dnode_t *)dn));
+
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+static MunitResult
+test_cursor_release_advance(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+
+	uint64_t refcount = mock_dnode_refcount((mock_dnode_t *)dn);
+
+	zap_cursor_t zc;
+	unit_ok(zap_cursor_init_by_dnode(&zc, dn));
+	zap_cursor_advance(&zc);
+	zap_cursor_fini(&zc);
+
+	unit_eq(refcount, mock_dnode_refcount((mock_dnode_t *)dn));
+
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+static MunitResult
+test_cursor_release_empty(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+
+	uint64_t refcount = mock_dnode_refcount((mock_dnode_t *)dn);
+
+	zap_cursor_t zc;
+	zap_attribute_t *za = zap_attribute_alloc();
+
+	unit_ok(zap_cursor_init_by_dnode(&zc, dn));
+	unit_err(zap_cursor_retrieve(&zc, za), ENOENT);
+
+	zap_attribute_free(za);
+	zap_cursor_fini(&zc);
+
+	unit_eq(refcount, mock_dnode_refcount((mock_dnode_t *)dn));
+
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+static MunitResult
+test_cursor_release_one(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	uint64_t v = 1;
+	unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx));
+	unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &v, tx));
+
+	uint64_t refcount = mock_dnode_refcount((mock_dnode_t *)dn);
+
+	zap_cursor_t zc;
+	zap_attribute_t *za = zap_attribute_alloc();
+
+	unit_ok(zap_cursor_init_by_dnode(&zc, dn));
+	unit_ok(zap_cursor_retrieve(&zc, za));
+
+	zap_attribute_free(za);
+	zap_cursor_fini(&zc);
+
+	unit_eq(refcount, mock_dnode_refcount((mock_dnode_t *)dn));
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
 /* ========== */
 
 /* Test suite definition and boilerplate. */
@@ -759,6 +866,15 @@ static const MunitTest zap_tests[] = {
 	UNIT_TEST_ZAP_TYPES("cursor",		test_cursor),
 	UNIT_TEST_ZAP_TYPES("cursor_serialize",	test_cursor_serialize),
 
+	UNIT_TEST_ZAP_TYPES(
+	    "cursor_release_unused",	test_cursor_release_unused),
+	UNIT_TEST_ZAP_TYPES(
+	    "cursor_release_advance",	test_cursor_release_advance),
+	UNIT_TEST_ZAP_TYPES(
+	    "cursor_release_empty",	test_cursor_release_empty),
+	UNIT_TEST_ZAP_TYPES(
+	    "cursor_release_one",	test_cursor_release_one),
+
 	{ 0 },
 };
 

From ef6f261454f82e9137599905a02512ae5640ac58 Mon Sep 17 00:00:00 2001
From: tiehexue <tiehexue@hotmail.com>
Date: Wed, 3 Jun 2026 03:31:31 +0800
Subject: [PATCH 100/129] When reading a vdev label skip libzfs_core_init()

There's no need to call libzfs_core_init() when `zdb -l` is used to
read a vdev label.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: tiehexue <tiehexue@hotmail.com>
Closes #18606
---
 cmd/zdb/zdb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 1dcd70f628b..05e005d929a 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -10230,7 +10230,7 @@ main(int argc, char **argv)
 	 * Automate cachefile
 	 */
 	if (!spa_config_path_env && !config_path_console && target &&
-	    libzfs_core_init() == 0) {
+	    !dump_opt['l'] && libzfs_core_init() == 0) {
 		char *pname = strdup(target);
 		const char *value;
 		nvlist_t *pnvl = NULL;

From 80fb85b80b5db58914a76bbb7a24ada230a98955 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 15 May 2026 01:11:24 +0000
Subject: [PATCH 101/129] Fix the integer type in zfs_ioc_userspace_many()

Fix the mismatched type in zfs_ioc_userspace_many() and limit the
number of entries returned to 1000.  When a size larger than this
is requested the response is truncated; zfs_userspace() already
correctly handles short responses.  Historically, zfs_userspace()
has requested 100 entries at a time; this cap allows for 10x larger
batch sizes if needed in the future.

Reported-by: Yuxiang Yang, Yizhou Zhao, Ao Wang, Xuewei Feng, Qi Li,
Reported-by: and Ke Xu from Tsinghua University using GLM-5.1 from Z.ai
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18615
---
 module/zfs/zfs_ioctl.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 7013fbfb64f..414ea6bad3c 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -6652,21 +6652,27 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
  * outputs:
  * zc_nvlist_dst[_size]	data buffer (array of zfs_useracct_t)
  * zc_cookie	zap cursor
+ *
+ * The zc_nvlist_dst output array is limited to 1000 entries.
  */
 static int
 zfs_ioc_userspace_many(zfs_cmd_t *zc)
 {
+	const size_t batch_limit = 1000 * sizeof (zfs_useracct_t);
+	uint64_t bufsize = MIN(zc->zc_nvlist_dst_size, batch_limit);
 	zfsvfs_t *zfsvfs;
-	int bufsize = zc->zc_nvlist_dst_size;
 
-	if (bufsize <= 0)
+	if (bufsize < sizeof (zfs_useracct_t)) {
+		zc->zc_nvlist_dst_size = sizeof (zfs_useracct_t);
 		return (SET_ERROR(ENOMEM));
+	}
 
 	int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
 	if (error != 0)
 		return (error);
 
 	void *buf = vmem_alloc(bufsize, KM_SLEEP);
+	zc->zc_nvlist_dst_size = bufsize;
 
 	error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
 	    buf, &zc->zc_nvlist_dst_size, &zc->zc_guid);

From 0aa4088dce0e7b2425e90c195ef4699d0b65ae46 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Tue, 2 Jun 2026 16:34:51 -0700
Subject: [PATCH 102/129] sharenfs: Check for invalid characters

Check for invalid characters in sharenfs/sharesmb dataset props.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18613
---
 lib/libzfs/libzfs_changelist.c | 15 ++++++++++++---
 lib/libzfs/libzfs_share.c      |  8 ++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c
index eac06f8f5ab..b1a2e17cb7a 100644
--- a/lib/libzfs/libzfs_changelist.c
+++ b/lib/libzfs/libzfs_changelist.c
@@ -177,6 +177,7 @@ changelist_postfix(prop_changelist_t *clp)
 	char shareopts[ZFS_MAXPROPLEN];
 	boolean_t commit_smb_shares = B_FALSE;
 	boolean_t commit_nfs_shares = B_FALSE;
+	int rc = 0;
 
 	/*
 	 * If CL_GATHER_DONT_UNMOUNT is set, it means we don't want to (un)mount
@@ -266,7 +267,7 @@ changelist_postfix(prop_changelist_t *clp)
 		const enum sa_protocol nfs[] =
 		    {SA_PROTOCOL_NFS, SA_NO_PROTOCOL};
 		if (sharenfs && mounted) {
-			zfs_share(cn->cn_handle, nfs);
+			rc = zfs_share(cn->cn_handle, nfs);
 			commit_nfs_shares = B_TRUE;
 		} else if (cn->cn_shared || clp->cl_waslegacy) {
 			zfs_unshare(cn->cn_handle, NULL, nfs);
@@ -275,7 +276,7 @@ changelist_postfix(prop_changelist_t *clp)
 		const enum sa_protocol smb[] =
 		    {SA_PROTOCOL_SMB, SA_NO_PROTOCOL};
 		if (sharesmb && mounted) {
-			zfs_share(cn->cn_handle, smb);
+			rc = zfs_share(cn->cn_handle, smb);
 			commit_smb_shares = B_TRUE;
 		} else if (cn->cn_shared || clp->cl_waslegacy) {
 			zfs_unshare(cn->cn_handle, NULL, smb);
@@ -291,7 +292,15 @@ changelist_postfix(prop_changelist_t *clp)
 	*p++ = SA_NO_PROTOCOL;
 	zfs_commit_shares(proto);
 
-	return (0);
+	/*
+	 * It's possible rc != 0 since we set a mountpoint or option while
+	 * SMB/NFS was not running.  This is fine, and we should not return
+	 * an error up the stack.
+	 *
+	 * At this point we only want to report mountpoint/shareops parsing
+	 * errors.
+	 */
+	return (rc == SA_SYNTAX_ERR ? rc : 0);
 }
 
 /*
diff --git a/lib/libzfs/libzfs_share.c b/lib/libzfs/libzfs_share.c
index bfac40f17de..98a09f7f331 100644
--- a/lib/libzfs/libzfs_share.c
+++ b/lib/libzfs/libzfs_share.c
@@ -64,6 +64,10 @@ sa_enable_share(const char *zfsname, const char *mountpoint,
 {
 	VALIDATE_PROTOCOL(protocol, SA_INVALID_PROTOCOL);
 
+	int error = sa_validate_shareopts(shareopts, protocol);
+	if (error != SA_OK)
+		return (error);
+
 	const struct sa_share_impl args =
 	    init_share(zfsname, mountpoint, shareopts);
 	return (fstypes[protocol]->enable_share(&args));
@@ -111,6 +115,10 @@ sa_validate_shareopts(const char *options, enum sa_protocol protocol)
 {
 	VALIDATE_PROTOCOL(protocol, SA_INVALID_PROTOCOL);
 
+	/* error out on invalid characters */
+	if (strpbrk(options, "\a\b\f\n\r") != NULL)
+		return (SA_SYNTAX_ERR);
+
 	return (fstypes[protocol]->validate_shareopts(options));
 }
 

From e199f6d98b40e37e79aad51f2fad7f1490968ede Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 2 Jun 2026 16:36:38 -0700
Subject: [PATCH 103/129] Fix uninitialized variable warning in vdev_prop_get()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update vdev_prop_get_objid() to set objid on error as the comment
in vdev_prop_get() describes.

    "objid is set to 0 when absent and the few cases that call
    zap_lookup directly guard against this below."

This resolves the following possible uninitialized variable warning.

    module/zfs/vdev.c: In function ‘vdev_prop_get’:
    module/zfs/vdev.c:6913:12: error: ‘objid’ may be used uninitialized
    in this function [-Werror=maybe-uninitialized]

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18616
---
 module/zfs/vdev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 91cd9c6dc84..211adae0968 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -460,6 +460,7 @@ vdev_prop_get_objid(vdev_t *vd, uint64_t *objid)
 	} else if (vd->vdev_leaf_zap != 0) {
 		*objid = vd->vdev_leaf_zap;
 	} else {
+		*objid = 0;
 		return (EINVAL);
 	}
 
@@ -6444,7 +6445,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	int err = 0;
-	uint64_t objid;
+	uint64_t objid = 0;
 	uint64_t vdev_guid;
 	nvpair_t *elem = NULL;
 	nvlist_t *nvprops = NULL;

From 7de42602cac5aada0af0f9af571c485f74333731 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 3 Jun 2026 14:45:21 -0700
Subject: [PATCH 104/129] Extend dataset zfs_ioc_set_prop() secpolicy

When zc->zc_cookie is set this indicates to zfs_ioc_set_prop() that
these are received properties and ZPROP_HAS_RECVD will be set on the
dataset.  This is only done as part of a `zfs receive` so additionally
apply the zfs_secpolicy_recv() policy.  Individual property checks
continue to be handled by zfs_check_settable().

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18617
---
 module/zfs/zfs_ioctl.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 414ea6bad3c..aeefab4fa64 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1088,6 +1088,23 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 	    ZFS_DELEG_PERM_CREATE, cr));
 }
 
+/*
+ * Policy for dataset set property operations.  Individual properties checked by
+ * zfs_check_settable(), additionally require zfs_secpolicy_recv() when setting
+ * properties as part of a receive.
+ */
+static int
+zfs_secpolicy_setprops(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	boolean_t received = zc->zc_cookie;
+	int error;
+
+	if (received && (error = zfs_secpolicy_recv(zc, innvl, cr)))
+		return (error);
+
+	return (zfs_secpolicy_read(zc, innvl, cr));
+}
+
 int
 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
 {
@@ -8056,7 +8073,7 @@ zfs_ioctl_init(void)
 	    zfs_ioc_send, zfs_secpolicy_send);
 
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
-	    zfs_secpolicy_none);
+	    zfs_secpolicy_setprops);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
 	    zfs_secpolicy_destroy);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,

From 4eb0851d71280969cb041f1a0fcb1ae32c45c4ea Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Thu, 4 Jun 2026 02:33:19 +0200
Subject: [PATCH 105/129] ZTS: fix mktemp usage on FreeBSD

On FreeBSD -t takes a required prefix argument. Use
"mktemp -d $TEST_BASE_DIR/..." instead.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18621
---
 tests/test-runner/bin/zts-report.py.in                          | 2 --
 tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh  | 2 +-
 tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh  | 2 +-
 .../zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh | 2 +-
 4 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 2cbd2f02a31..a80112d914e 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -256,7 +256,6 @@ maybe = {
     'send_xdr_encoding/xdr_bookmark_raw_with_write': ['FAIL', 18491],
     'send_xdr_encoding/xdr_resume_bookmark_raw_with_write': ['FAIL', 18491],
     'snapshot/clone_001_pos': ['FAIL', known_reason],
-    'snapshot/snapshot_006_pos': ['FAIL', known_reason],
     'snapshot/snapshot_009_pos': ['FAIL', 7961],
     'snapshot/snapshot_010_pos': ['FAIL', 7961],
     'snapused/snapused_004_pos': ['FAIL', 5513],
@@ -279,7 +278,6 @@ if sys.platform.startswith('freebsd'):
         'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622],
         'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623],
         'resilver/resilver_restart_001': ['FAIL', known_reason],
-        'snapshot/snapshot_002_pos': ['FAIL', 14831],
         'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', 16668],
         'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_corner_cases_limited':
diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh
index ffc4e96f5a0..0f3b1a84d83 100755
--- a/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh
@@ -64,7 +64,7 @@ function cleanup
 log_assert "Verify an archive of a file system is identical to " \
     "an archive of its snapshot."
 
-SNAPSHOT_TARDIR="$(mktemp -t -d zfstests_snapshot_002.XXXXXX)"
+SNAPSHOT_TARDIR="$(mktemp -d "$TEST_BASE_DIR/zfstests_snapshot_002.XXXXXX")"
 log_onexit cleanup
 
 typeset -i COUNT=21
diff --git a/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh b/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh
index 20d53eb5012..db8d820bdd1 100755
--- a/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh
+++ b/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh
@@ -73,7 +73,7 @@ function cleanup
 log_assert "Verify that an archive of a dataset is identical to " \
    "an archive of the dataset's snapshot."
 
-SNAPSHOT_TARDIR="$(mktemp -t -d zfstests_snapshot_006.XXXXXX)"
+SNAPSHOT_TARDIR="$(mktemp -d "$TEST_BASE_DIR/zfstests_snapshot_006.XXXXXX")"
 log_onexit cleanup
 
 typeset -i COUNT=21
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh
index 9047f14bc81..0f18f2e5733 100755
--- a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh
@@ -58,7 +58,7 @@ biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL)
 typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \
 	$num_zvols )))
 
-typeset tmpdir="$(mktemp -t -d zvol_stress_fio_state.XXXXXX)"
+typeset tmpdir="$(mktemp -d "$TEST_BASE_DIR/zvol_stress_fio_state.XXXXXX")"
 
 log_must save_tunable VOL_USE_BLK_MQ
 log_must save_tunable VOL_REQUEST_SYNC

From 4256f4f8e09e786d4bf7af2a1cecff1394c0b3b3 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 4 Jun 2026 09:24:06 -0700
Subject: [PATCH 106/129] pam: use open fd instead of path

Instead of performing multiple operations on the path name in
zfs_key_config_modify_session_counter() open the file once and
perform the fchown, fchmod, and openat on the open file handle.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18618
---
 contrib/pam_zfs_key/pam_zfs_key.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c
index d5513b7a43f..5477c7dc611 100644
--- a/contrib/pam_zfs_key/pam_zfs_key.c
+++ b/contrib/pam_zfs_key/pam_zfs_key.c
@@ -840,27 +840,41 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh,
 		    errno);
 		return (-1);
 	}
-	if (chown(runtime_path, 0, 0) != 0) {
-		pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d",
-		    errno);
+	const int runtime_fd = open(runtime_path,
+	    O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_DIRECTORY);
+	if (runtime_fd < 0) {
+		pam_syslog(pamh, LOG_ERR, "Can't open runtime path: %d", errno);
 		return (-1);
 	}
-	if (chmod(runtime_path, S_IRWXU) != 0) {
+	if (fchown(runtime_fd, 0, 0) != 0) {
+		pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d",
+		    errno);
+		close(runtime_fd);
+		return (-1);
+	}
+	if (fchmod(runtime_fd, S_IRWXU) != 0) {
 		pam_syslog(pamh, LOG_ERR, "Can't chmod runtime path: %d",
 		    errno);
+		close(runtime_fd);
 		return (-1);
 	}
 
 	char *counter_path;
-	if (asprintf(&counter_path, "%s/%u", runtime_path, config->uid) == -1)
+	if (asprintf(&counter_path, "%u", config->uid) == -1) {
+		close(runtime_fd);
 		return (-1);
+	}
 
-	const int fd = open(counter_path,
+	const int fd = openat(runtime_fd, counter_path,
 	    O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW,
 	    S_IRUSR | S_IWUSR);
+	int ret = errno;
+
 	free(counter_path);
+	close(runtime_fd);
+
 	if (fd < 0) {
-		pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", errno);
+		pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", ret);
 		return (-1);
 	}
 	if (flock(fd, LOCK_EX) != 0) {
@@ -871,7 +885,6 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh,
 	char counter[20];
 	char *pos = counter;
 	int remaining = sizeof (counter) - 1;
-	int ret;
 	counter[sizeof (counter) - 1] = 0;
 	while (remaining > 0 && (ret = read(fd, pos, remaining)) > 0) {
 		remaining -= ret;

From 9f23793d538c9dc7f7987336fd1c5fc0b2a77c05 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 3 Jun 2026 12:06:22 +1000
Subject: [PATCH 107/129] coverage_report: produce nice text reports from
 lcov/geninfo tracefiles

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18619
---
 scripts/Makefile.am        |   1 +
 scripts/coverage_report.pl | 392 +++++++++++++++++++++++++++++++++++++
 2 files changed, 393 insertions(+)
 create mode 100755 scripts/coverage_report.pl

diff --git a/scripts/Makefile.am b/scripts/Makefile.am
index df2fae42fce..ed18a81b375 100644
--- a/scripts/Makefile.am
+++ b/scripts/Makefile.am
@@ -28,6 +28,7 @@ dist_noinst_SCRIPTS += $(scripts_scripts)
 endif
 
 dist_noinst_DATA += \
+	%D%/coverage_report.pl \
 	%D%/cstyle.pl \
 	%D%/update_authors.pl
 
diff --git a/scripts/coverage_report.pl b/scripts/coverage_report.pl
new file mode 100755
index 00000000000..ba8dec7a8d9
--- /dev/null
+++ b/scripts/coverage_report.pl
@@ -0,0 +1,392 @@
+#!/usr/bin/env perl
+
+# SPDX-License-Identifier: MIT
+#
+# Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+# Copyright (c) 2026, TrueNAS.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+#
+# usage: coverage_report.pl tests/unit/test_zap.info
+#        coverage_report.pl < tests/unit/test_zap.info
+#
+# This program takes an lcov/geninfo coverage tracefile and shows a summary
+# of line, branch and function coverage for each file. It's focused on the
+# specific needs of OpenZFS' unit test suite (see tests/unit/README.md) but
+# it should be adaptable to any place where lcov's HTML output is too heavy
+# or difficult to use (eg build/CI logs).
+#
+# The heart of this program is a small parser for the tracefile format as
+# described in geninfo(1). The rest is concerned with constructing a useful
+# colorised table output.
+#
+
+#
+# Typical output:
+#
+# Coverage: test_zap       | By line         | By branch       | By function
+#                          | Rate% Total Hit | Rate% Total Hit | Rate% Total Hit
+# module/zfs/u8_textprep.c | 42.0%   802 337 | 33.5%   510 171 | 50.0%    12   6
+# module/zfs/zap.c         | 52.1%   687 358 | 45.2%   250 113 | 41.1%    90  37
+# module/zfs/zap_fat.c     | 87.8%   665 584 | 58.5%   446 261 | 94.6%    37  35
+# module/zfs/zap_impl.c    | 81.9%   232 190 | 60.3%   146  88 | 92.0%    25  23
+# module/zfs/zap_leaf.c    | 86.7%   466 404 | 69.0%   216 149 | 95.7%    23  22
+# module/zfs/zap_micro.c   | 76.5%   238 182 | 54.2%   142  77 | 92.9%    14  13
+#
+
+use 5.010;
+use warnings;
+use strict;
+use Cwd qw(getcwd);
+use Term::ANSIColor qw(colored);
+
+# Setup for color output. Perl has included Term::ANSIColor since 5.6 (~2000),
+# but RGB support didn't arrive until v4 in 5.17.8 (~2012). We disable colors
+# outright on versions < 4, or if output is not attached to a terminal.
+my $use_colors = -t \*STDOUT && $Term::ANSIColor::VERSION >= 4;
+
+# Palette setup. With Term::ANSIColor >= 5 and COLORTERM set we use the exact
+# 24-bit colors for a pleasant red -> green gradient. Otherwise each component
+# is scaled into the older RGB-240 range (0-5); dividing by 256, not 255,
+# keeps a full-intensity 0xff component at 5 instead of overflowing to 6.
+my @palette = !$use_colors ? () : map {
+	state $has_truecolor =
+	    $Term::ANSIColor::VERSION >= 5 && $ENV{COLORTERM};
+	my @rgb = map { hex } m/../g;
+	if ($has_truecolor) {
+		sprintf 'r%dg%db%d', @rgb;
+	} else {
+		sprintf 'rgb%d%d%d', map { int($_ * 6 / 256) } @rgb;
+	}
+} (
+	# Catppuccin Latte
+	# https://catppuccin.com/palette/
+	'd20f39',	# Red
+	'e64553',	# Maroon
+	'fe640b',	# Peach
+	'df8e1d',	# Yellow
+	'40a02b',	# Green
+	'179299',	# Teal
+);
+
+# Test name, from the TN: field if present.
+my $test_name = '';
+
+# Per-file data, initially sourced from the tracefile, then augmented
+my %filedata;
+
+# Tracking for the longest (stringified) value for each key. These are used
+# later when computing the output table column width.
+my %len;
+sub bump_len {
+	my ($k, $x) = @_;
+	my $l = length "".$x;
+	$len{$k} = $l if ($len{$k} // 0) < $l;
+}
+
+###
+# Parse the tracefile into per-file data records.
+
+# Current working directory. Expected to be the build root. Used to remove
+# the leading part of the source filenames, so it's not the end of the world
+# if it's wrong.
+my $cwd = getcwd;
+
+# Loop over the input
+while (my $line = <>) {
+	state $data = {};
+	chomp $line;
+
+	# skip comments
+	next if $line =~ m/^#/;
+
+	if ($line eq 'end_of_record') {
+		# end of this file, prep for next
+		$data = {};
+		next;
+	}
+
+	# everything else should be a KEY:VALUE line
+	my ($k, $v) = $line =~ m/^([A-Z]+):(.*)$/;
+	unless (defined $k) {
+		say "W: $.: malformed line: $line";
+		next;
+	}
+
+	if ($k eq 'TN') {
+		# TN:test_zap
+
+		# Test name. This is actually per-record (a tracefile can
+		# carry multiple test results) but we only ever generate
+		# them for a single test, so we don't make any effort to
+		# notice or track changes.
+		$test_name = $v;
+		next;
+	}
+
+	if ($k eq 'SF') {
+		# SF:/home/robn/code/zfs-unit/module/zfs/zap.c
+
+		# Source file. Value is the name, and the rest of the record
+		# applies to it.
+
+		# Remove the leading build root name, matched literally.
+		my $path = $v;
+		$path =~ s{^\Q$cwd\E/*}{};
+
+		# If we haven't seen this file before, create a new data
+		# record for it.
+		$filedata{$v} //= { path => $path };
+		$data = $filedata{$v};
+
+		# Increase path column width if necessary.
+		bump_len('path', $path);
+		next;
+	}
+
+	# Handle the counter keys. These are single values for the entire
+	# record in the file. L, FN and BR are Line, Function and Branch,
+	# F and H are found (ie total) and hit (ie was executed).
+	if (grep { $_ eq $k } qw(LF LH FNF FNH BRF BRH)) {
+		$data->{lc $k} = $v;
+		bump_len(lc $k, $v);
+		next;
+	}
+
+	# Older versions of lcov may not emit absolute found/hit counters. To
+	# handle this, we maintain our own counters from other events recorded
+	# in the info file, which we use if we don't get an absolute count.
+
+	if ($k eq 'DA') {
+		# DA:<line number>,<execution count>[,<checksum>]
+		# DA:463,0
+		# DA:469,153
+		my ($l, $h) = split ',', $v;
+
+		# One DA: record per actual code line (vs comment or other
+		# non-executable line), so we count records, not line number.
+		$data->{_lf}++;
+
+		# Only increment the hit count if the line was executed.
+		$data->{_lh}++ if $h > 0;
+		next;
+	}
+
+	if ($k eq 'FN') {
+		# FN:<start line>,[<end line>,]<function name>
+		# FN:283,zap_lookup_by_dnode
+
+		# One FN record per function
+		$data->{_fnf}++;
+		next;
+	}
+	if ($k eq 'FNDA') {
+		# FNDA:<execution count>,<function name>
+		# FNDA:0,zap_lookup
+		# FNDA:78,zap_lookup_by_dnode
+
+		# Only count hit if at least one execution.
+		my ($c) = split ',', $v;
+		$data->{_fnh}++ if 0+$c > 0;
+		next;
+	}
+
+	if ($k eq 'BRDA') {
+		# BRDA:<line_number>,[<exception>]<block>,<branch>,<taken>
+		# BRDA:365,0,0,-
+		# BRDA:365,0,1,-
+		my ($l, $b, $br, $c) = split ',', $v;
+
+		# One BRDA: record per branch
+		$data->{_brf}++;
+
+		# <taken> is number of times branch arm was taken, or '-' if
+		# never considered (eg surrounding block was never entered);
+		# they're both 0 for our purposes.
+		$c = 0 if $c eq '-';
+
+		# Only count hit if at least one execution.
+		$data->{_brh}++ if 0+$c > 0;
+		next;
+	}
+}
+
+###
+# Synthesize missing counters
+
+for my $file (keys %filedata) {
+	my $data = $filedata{$file};
+
+	for my $k (qw(lf lh fnf fnh brf brh)) {
+		# Get our own count, if one exists.
+		my $v = delete $data->{"_$k"} // 0;
+
+		# If we didn't find a count in the info file, use our own.
+		# Note that this will also set legitimately unseen values to
+		# 0 (eg a source file with no branches). That's actually what
+		# we want.
+		unless (exists $data->{$k}) {
+			$data->{$k} = $v;
+			bump_len($k, $v);
+		}
+	}
+}
+
+###
+# Synthesize the "rate" percentage field from the "found" and "hit" fields.
+
+sub rate {
+	my ($data, $k, $kf, $kh) = @_;
+	my $rate = sprintf '%.01f%%',
+	    $data->{$kf} ? (100 * $data->{$kh} / $data->{$kf}) : 0;
+	$data->{$k} = $rate;
+	bump_len($k, $rate);
+}
+
+for my $file (keys %filedata) {
+	my $data = $filedata{$file};
+	rate($data, 'lr', 'lf', 'lh');
+	rate($data, 'brr', 'brf', 'brh');
+	rate($data, 'fnr', 'fnf', 'fnh');
+}
+
+###
+# Set up the header "rows".
+
+# We reuse our data record structure a little because outputting these needs to
+# consider and sometimes contribute to column width.
+
+# The top row spans multiple columns. The pad functions below have extra tools
+# to handle the math.
+my $h1data = {
+	path => 'Coverage'.($test_name ? ": $test_name" : ''),
+	l => 'By line',
+	br => 'By branch',
+	fn => 'By function',
+};
+bump_len('path', $h1data->{path});
+
+# The second row is the actual header for each data column, and so may push
+# the column widths out if necessary.
+my $h2data = {
+	lr  => 'Rate%', lf  => 'Total', lh  => 'Hit',
+	brr => 'Rate%', brf => 'Total', brh => 'Hit',
+	fnr => 'Rate%', fnf => 'Total', fnh => 'Hit',
+};
+bump_len($_, $h2data->{$_}) for keys %$h2data;
+
+###
+# Table layout
+
+# Internal helper for padr() and padl() below. The idea is to compute the
+# effective column width, and the string we want to place in it. If it would
+# fit exactly, we return the string. If not, the passed-in function is called
+# with the string, its length and the column width, and it will place it
+# (by adding padding on either side).
+#
+# Most calls take a single column key, which makes it very simple - take
+# the max width for that column (from %len, set by bump_len()), and the value
+# of that key in this column, and that's all of it.
+#
+# For the top heading row (h1data above), a list of column keys can be passed
+# in. In this case, the string will be constructed as a space-separated list
+# of all the keys that have a value in the data row. The column width is the
+# sum of max column widths for all columns that have a max column width, plus
+# one for each space separator. This allows us to provide a separate string
+# to appear in the space, with the amount of space computed from the columns
+# underneath it.
+#
+sub _pad {
+	my ($fn, $data, @k) = @_;
+	my $str = join ' ', map { $data->{$_} // () } @k;
+	my $strlen = length $str;
+	my $colwidth = -1;
+	$colwidth += ($len{$_} // -1)+1 for @k;
+	return $strlen == $colwidth ? $str : $fn->($str, $strlen, $colwidth);
+}
+
+# Return the value of the named fields, with space-padding added to the right.
+sub padr {
+	_pad(sub {
+		my ($str, $strlen, $colwidth) = @_;
+		$str . (' ' x ($colwidth - $strlen));
+	}, @_);
+}
+
+# Return the value of the named fields, with space-padding added to the left.
+sub padl {
+	_pad(sub {
+		my ($str, $strlen, $colwidth) = @_;
+		(' ' x ($colwidth - $strlen)) . $str;
+	}, @_);
+}
+
+# Return the given % string, wrapped in terminal control codes that will give
+# it an appropriate color from the palette.
+sub colorpct {
+	my ($pct) = @_;
+
+	# If colors are disabled, return the string as-is.
+	return $pct unless $use_colors;
+
+	my ($n) = $pct =~ m/([0-9\.]+)/;
+
+	# scale 0-100 into palette range
+	my $s = int(($#palette / 100) * $n);
+	my $c = $palette[$s];
+
+	return colored([$c], $pct);
+}
+
+my @rows;
+
+# Layout the first header row
+push @rows, [
+	padr($h1data, 'path'),
+	'|', padr($h1data, 'l', 'lr', 'lf', 'lh'),
+	'|', padr($h1data, 'br', 'brr', 'brf', 'brh'),
+	'|', padr($h1data, 'fn', 'fnr', 'fnf', 'fnh'),
+];
+
+# Layout the second header row
+push @rows, [
+	padr($h2data, 'path'),
+	'|', padr($h2data, 'lr'), padl($h2data, 'lf'), padl($h2data, 'lh'),
+	'|', padr($h2data, 'brr'), padl($h2data, 'brf'), padl($h2data, 'brh'),
+	'|', padr($h2data, 'fnr'), padl($h2data, 'fnf'), padl($h2data, 'fnh'),
+];
+
+# Layout the data rows, padding colorising as appropriate.
+for my $file (sort keys %filedata) {
+	my $data = $filedata{$file};
+
+	push @rows, [
+	    padr($data, 'path'),
+	    '|', colorpct(padl($data, 'lr')),
+	    padl($data, 'lf'), padl($data, 'lh'),
+	    '|', colorpct(padl($data, 'brr')),
+	    padl($data, 'brf'), padl($data, 'brh'),
+	    '|', colorpct(padl($data, 'fnr')),
+	    padl($data, 'fnf'), padl($data, 'fnh'),
+	];
+}
+
+# And print them all out!
+say "@$_" for @rows;

From 9a6dd0e1bc541612c0325101b5c63feffca3d2cf Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 3 Jun 2026 15:12:42 +1000
Subject: [PATCH 108/129] unit: support text & HTML targets; improve coverage
 rules

The main change is switching `unit-coverage` to run
scripts/coverage_report.pl, to get nice coverage summary output on the
commandline. The previous behaviour moves to `unit-coverage-html`.

Calls to lcov and genhtml are now silencing more warnings, and the
output file now gets branch coverage as well.

This should be compatible with both lcov 1.x and 2.x. It takes advantage
of the fact that 1.x is far more forgiving of both options it doesn't
understand, and of various kinds of "inconsistency" in the input data.

The rest is both simplifying and improving the rules. We keep the
coverage output around now, but still rebuild it if the binary changes.
The `clean` target now removes the coverage output too. And we use the
target name more often for building path names, as it's far less noisy.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18619
---
 tests/unit/Makefile.am | 54 +++++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am
index cb5bfc10013..80fe7311c46 100644
--- a/tests/unit/Makefile.am
+++ b/tests/unit/Makefile.am
@@ -39,27 +39,50 @@ nodist_%C%_test_zap_SOURCES = \
 
 
 # test run and coverage targets below
-PHONY += unit unit-coverage
+PHONY += unit unit-coverage unit-coverage-html
 
 _unit_run_%: %D%/%
 	@echo "  UNITTEST $<" ; $< $(TOPT)
 
-_unit_coverage_%: %D%/%
-	@${LCOV} --quiet --zerocounters --directory $(top_srcdir) >/dev/null
+# note: any changes in switches to lcov or genhtml must be carefully checked
+#       on 1.x and 2.x; the current option set is carefully chosen to allow
+#       both to work sensibly
+
+# .info is marked PRECIOUS, because it's usually only created as an intermediate
+# from one of the unit phony targets, but once it exists there's no point
+# remaking it until and unless the test binary itself changes
+.PRECIOUS: %D%/%.info
+%D%/%.info: %D%/%
+	@-${RM} $@
+	@${LCOV} --quiet --quiet --zerocounters --directory $(top_srcdir)
 	@echo "  UNITTEST $<" ; $< $(TOPT)
-	@${LCOV} --quiet --capture  \
-		--test-name $(subst _unit_coverage_, , $@) \
+	@${LCOV} --quiet --quiet --capture  \
+		--test-name $(notdir $<) \
 		--directory $(top_srcdir) \
-		--output-file \
-			%D%/$(join $(subst _unit_coverage_, , $@), .info) \
-		$(addprefix --include , $(call $(join \
-			$(subst _unit_coverage_, nodist_%C%_, $@), _SOURCES)))
-	@${GENHTML} --quiet \
-		%D%/$(join $(subst _unit_coverage_, , $@), .info) \
-		--output-directory \
-			%D%/$(join $(subst _unit_coverage_, , $@), _coverage)
+		--output-file $@ \
+		--rc lcov_branch_coverage=1 \
+		--rc geninfo_unexecuted_blocks=1 \
+		$(addprefix --include $(abs_top_builddir)/, $(call \
+		    $(join $(join nodist_%C%_, $(notdir $<)), _SOURCES))) \
+		2>/dev/null
+
+_unit_coverage_%: %D%/%.info
+	@scripts/coverage_report.pl $<
+
+_unit_coverage_html_%: %D%/%.info
+	@-${RM} -r $(subst .info,_coverage, $<)
+	@${GENHTML} --quiet --quiet \
+		--rc lcov_branch_coverage=1 \
+		--rc check_data_consistency=0 \
+		--output-directory $(subst .info,_coverage, $<) \
+		$< \
+		2>/dev/null
 	@echo "coverage results:" \
-		"file://$(realpath %D%)/$(join $(subst _unit_coverage_, , $@), _coverage)/index.html"
+		"file://$(realpath %D%)/$(subst .info,_coverage,$(notdir $<))/index.html"
+
+CLEAN_LOCAL += unit-clean-local
+unit-clean-local:
+	-${RM} -r %D%/*.info %D%/*_coverage/
 
 _UNIT_ALL_TARGETS = $(notdir $(UNIT_TESTS))
 _UNIT_FIND_TARGET = \
@@ -76,9 +99,12 @@ unit: $(addprefix _unit_run_, $(_UNIT_TARGETS))
 if CODE_COVERAGE_ENABLED
 unit-coverage: $(addprefix _unit_coverage_, $(_UNIT_TARGETS))
 	@$(if $^, true, echo "ERROR: couldn't find unit test: $(T)" && false)
+unit-coverage-html: $(addprefix _unit_coverage_html_, $(_UNIT_TARGETS))
+	@$(if $^, true, echo "ERROR: couldn't find unit test: $(T)" && false)
 else
 unit-coverage:
 	@echo "unit test coverage not enabled."
 	@echo "re-run configure with --enable-code-coverage"
 	@false
+unit-coverage-html: unit-coverage
 endif

From e03375947c54de0b07597a543922d734e3f6fb57 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 3 Jun 2026 16:47:04 +1000
Subject: [PATCH 109/129] unit: update docs for new coverage report options

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18619
---
 tests/unit/README.md | 74 +++++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/tests/unit/README.md b/tests/unit/README.md
index a7096067529..6a4ee095af2 100644
--- a/tests/unit/README.md
+++ b/tests/unit/README.md
@@ -62,46 +62,70 @@ assist with understanding issues.
 make -j$(nproc)
 ```
 
-TODO: add `--with-config=unit` that disables _everything_ not needed for the tests
+TODO: add `--with-config=unit` that disables _everything_ not needed for the
+tests
 
 ### Generating a coverage report
 
-If `configure` was run with `--enable-code-coverage`, then an additional
-`unit-coverage` target is available, which will run the requested tests, then
-run `lcov` and `genhtml` to produce a HTML coverage report:
+If `configure` was run with `--enable-code-coverage`, then two additional build
+targets are available that will run the requested tests and produce a report.
+
+The `unit-coverage` target runs `scripts/coverage_report.pl` to produce a
+coverage summary directly in text immediately after the test output, and is
+good for inclusion in log files and other build system output.
 
 ```
 $ make unit-coverage T=zap
   UNITTEST tests/unit/test_zap
-Running test suite with seed 0xe461208d...
-zap.mock_microzap_sanity             [ OK    ] [ 0.00000933 / 0.00000773 CPU ]
-zap.mock_fatzap_sanity               [ OK    ] [ 0.00004685 / 0.00004612 CPU ]
-zap.zap_basic
-  type=micro                         [ OK    ] [ 0.00002579 / 0.00002484 CPU ]
-  type=fat                           [ OK    ] [ 0.00004093 / 0.00004038 CPU ]
-4 of 4 (100%) tests successful, 0 (0%) test skipped.
-lcov: WARNING: (inconsistent) /home/robn/code/zfs-unit/module/zfs/u8_textprep.c:1104: unexecuted block on non-branch line with non-zero hit count.  Use "geninfo --rc geninfo_unexecuted_blocks=1 to set count to zero.
-	(use "lcov --ignore-errors inconsistent,inconsistent ..." to suppress this warning)
-Message summary:
-  1 warning message:
-    inconsistent: 1
-Overall coverage rate:
-  source files: 6
-  lines.......: 42.3% (1270 of 3002 lines)
-  functions...: 42.0% (76 of 181 functions)
-Message summary:
-  no messages were reported
-coverage results: file://tests/unit/test_zap_coverage/index.html
+Running test suite with seed 0xf51efca9...
+zap.mock_microzap_sanity             [ OK    ] [ 0.00000941 / 0.00000834 CPU ]
+zap.mock_fatzap_sanity               [ OK    ] [ 0.00005782 / 0.00005766 CPU ]
+...
+zap.cursor_release_one
+  type=micro                         [ OK    ] [ 0.00001705 / 0.00001681 CPU ]
+  type=fat                           [ OK    ] [ 0.00004748 / 0.00004738 CPU ]
+30 of 30 (100%) tests successful, 0 (0%) test skipped.
+Coverage: test_zap       | By line         | By branch       | By function
+                         | Rate% Total Hit | Rate% Total Hit | Rate% Total Hit
+module/zfs/u8_textprep.c |  0.0%   802   0 |  0.0%   510   0 |  0.0%    12   0
+module/zfs/zap.c         | 33.9%   610 207 | 31.1%   238  74 | 23.0%    74  17
+module/zfs/zap_fat.c     | 47.1%   665 313 | 29.8%   446 133 | 62.2%    37  23
+module/zfs/zap_impl.c    | 57.8%   232 134 | 39.7%   146  58 | 72.0%    25  18
+module/zfs/zap_leaf.c    | 60.9%   466 284 | 41.2%   216  89 | 78.3%    23  18
+module/zfs/zap_micro.c   | 68.9%   238 164 | 41.5%   142  59 | 92.9%    14  13
 ```
 
-TODO: improve the overall structure to make this less noisy.
+The `unit-coverage-html` target will use `lcov` and `genhtml` to generate an
+interactive HTML report that can also show the specific source lines that are
+covered.
+
+```
+$ make unit-coverage-html T=zap
+  UNITTEST tests/unit/test_zap
+Running test suite with seed 0x485bf2e2...
+zap.mock_microzap_sanity             [ OK    ] [ 0.00000935 / 0.00000794 CPU ]
+zap.mock_fatzap_sanity               [ OK    ] [ 0.00006050 / 0.00006025 CPU ]
+...
+zap.cursor_release_one
+  type=micro                         [ OK    ] [ 0.00001785 / 0.00001767 CPU ]
+  type=fat                           [ OK    ] [ 0.00005262 / 0.00005250 CPU ]
+30 of 30 (100%) tests successful, 0 (0%) test skipped.
+coverage results:
+file:///home/robn/code/zfs-unit/tests/unit/tests/unit/test_zap_coverage/index.ht
+ml
+```
+
+Currently the coverage data will only be regenerated when the test binary
+itself changes. To force it, use `make unit-clean-local` to remove the coverage
+data.
 
 ## Guidance for test writers
 
 ### Top five
 
 * Only bring in the source files under test.
-* Use mocks to create the test scenario, then interrogate them to understand the result.
+* Use mocks to create the test scenario, then interrogate them to understand
+the result.
 * Prefer more smaller tests over fewer bigger ones.
 * Use coverage reports to guide test development.
 * Do the simplest possible thing.

From 5fea0c838a1559f2e927fad7304d89c9969224ba Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Thu, 4 Jun 2026 16:25:40 -0400
Subject: [PATCH 110/129] Parallelize metaslab_sync_done() calls

Some of our random write benchmarks on a fragmented pool show that the
single-threaded portion of the sync process (txg_sync_thread) can use
up to 45% of CPU time.  Most of it is consumed by metaslab_sync()
and metaslab_sync_done(), during which time the pool is not doing
anything else.

While metaslab_sync() is not trivial to parallelize due to having
a single spacemap log, metaslab_sync_done() is doing only per-metaslab
accounting, so those calls can run in parallel.  Even better, we can run
them while waiting for vdev label update and cache flush I/Os.

With this patch, on my test system a similar test randomly writing 12
100GB files with 4KB blocks shows an IOPS increase from 176K to 220K.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18622
---
 include/sys/vdev_impl.h |  1 +
 module/zfs/spa.c        |  7 ++++---
 module/zfs/vdev.c       | 30 ++++++++++++++++++++++++++----
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 3c19b9abe9c..84e78f5dbc8 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -592,6 +592,7 @@ extern boolean_t vdev_log_state_valid(vdev_t *vd);
 extern int vdev_load(vdev_t *vd);
 extern int vdev_dtl_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
+extern void vdev_sync_dispatch(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
 extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
 extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index ec93ce97433..c6ae91b8d9e 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -11019,6 +11019,10 @@ spa_sync(spa_t *spa, uint64_t txg)
 		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
 	}
 
+	for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+	    vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
+		vdev_sync_dispatch(vd, txg);
+
 	spa_sync_rewrite_vdev_config(spa, tx);
 	dmu_tx_commit(tx);
 
@@ -11043,9 +11047,6 @@ spa_sync(spa_t *spa, uint64_t txg)
 
 	dsl_pool_sync_done(dp, txg);
 
-	/*
-	 * Update usable space statistics.
-	 */
 	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
 	    != NULL)
 		vdev_sync_done(vd, txg);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 211adae0968..53a3b927d52 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4246,17 +4246,39 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
 	dmu_tx_commit(tx);
 }
 
+static void
+metaslab_sync_done_task(void *arg)
+{
+	metaslab_t *msp = arg;
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	metaslab_sync_done(msp, spa_syncing_txg(spa));
+}
+
+void
+vdev_sync_dispatch(vdev_t *vd, uint64_t txg)
+{
+	spa_t *spa = vd->vdev_spa;
+
+	ASSERT(vdev_is_concrete(vd));
+
+	for (metaslab_t *msp = txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg));
+	    msp; msp = txg_list_next(&vd->vdev_ms_list, msp, TXG_CLEAN(txg))) {
+		(void) taskq_dispatch(spa->spa_sync_tq,
+		    metaslab_sync_done_task, msp, TQ_SLEEP);
+	}
+}
+
 void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
-	metaslab_t *msp;
 	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 	ASSERT(vdev_is_concrete(vd));
 
-	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
-	    != NULL)
-		metaslab_sync_done(msp, txg);
+	taskq_wait(vd->vdev_spa->spa_sync_tq);
+
+	while (txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)) != NULL)
+		;
 
 	if (reassess) {
 		metaslab_sync_reassess(vd->vdev_mg);

From a65ed7afd392222b5f1c3d0977d7c6f6a8edc34f Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Thu, 4 Jun 2026 22:39:00 +0200
Subject: [PATCH 111/129] zpool/zfs: accept --help and -? after a subcommand

Print the short usage instead of "invalid option".

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18541
---
 cmd/zfs/zfs_main.c     | 12 ++++++++++++
 cmd/zpool/zpool_main.c | 12 ++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 4c21c92bcd2..d448a3df60e 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -9399,6 +9399,18 @@ main(int argc, char **argv)
 		return (1);
 	}
 
+	/*
+	 * Special case '<subcommand> --help|-?'
+	 */
+	if (argc >= 3 && (strcmp(argv[2], "--help") == 0 ||
+	    strcmp(argv[2], "-?") == 0)) {
+		int idx;
+		if (find_command_idx(cmdname, &idx) == 0) {
+			current_command = &command_table[idx];
+			usage(B_FALSE);
+		}
+	}
+
 	zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
 
 	libzfs_print_on_error(g_zfs, B_TRUE);
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 3ed7babc1ca..05ea5e35446 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -13878,6 +13878,18 @@ main(int argc, char **argv)
 	if (strcmp(cmdname, "help") == 0)
 		return (zpool_do_help(argc, argv));
 
+	/*
+	 * Special case '<subcommand> --help|-?'
+	 */
+	if (argc >= 3 && (strcmp(argv[2], "--help") == 0 ||
+	    strcmp(argv[2], "-?") == 0)) {
+		int idx;
+		if (find_command_idx(cmdname, &idx) == 0) {
+			current_command = &command_table[idx];
+			usage(B_FALSE);
+		}
+	}
+
 	if ((g_zfs = libzfs_init()) == NULL) {
 		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
 		return (1);

From 2076569ce886058db860232d4db84443c05044a1 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Thu, 4 Jun 2026 16:45:13 -0700
Subject: [PATCH 112/129] Remove /etc/sudoers.d/zfs

The smartctl exception in /etc/sudoers.d/zfs doesn't cover devices
like NVMe or symlinked devices.  Just get rid of it rather than
keep maintaining it.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18626
---
 contrib/debian/not-installed | 1 -
 etc/Makefile.am              | 6 ------
 etc/sudoers.d/zfs            | 9 ---------
 man/man8/zpool-iostat.8      | 5 +----
 rpm/generic/zfs.spec.in      | 1 -
 5 files changed, 1 insertion(+), 21 deletions(-)
 delete mode 100644 etc/sudoers.d/zfs

diff --git a/contrib/debian/not-installed b/contrib/debian/not-installed
index 9c08da5a6a7..efe17c90c3b 100644
--- a/contrib/debian/not-installed
+++ b/contrib/debian/not-installed
@@ -2,7 +2,6 @@ usr/bin/zarcsummary.py
 usr/share/zfs/zfs-helpers.sh
 etc/default/zfs
 etc/init.d
-etc/sudoers.d
 etc/zfs/vdev_id.conf.alias.example
 etc/zfs/vdev_id.conf.multipath.example
 etc/zfs/vdev_id.conf.sas_direct.example
diff --git a/etc/Makefile.am b/etc/Makefile.am
index 2bea12ae514..5168c3cde13 100644
--- a/etc/Makefile.am
+++ b/etc/Makefile.am
@@ -1,10 +1,4 @@
 # SPDX-License-Identifier: CDDL-1.0
-sudoersddir = $(sysconfdir)/sudoers.d
-sudoersd_DATA = \
-	%D%/sudoers.d/zfs
-
-dist_noinst_DATA += $(sudoersd_DATA)
-
 
 sysconf_zfsdir = $(sysconfdir)/zfs
 
diff --git a/etc/sudoers.d/zfs b/etc/sudoers.d/zfs
deleted file mode 100644
index 82a25ba81ec..00000000000
--- a/etc/sudoers.d/zfs
+++ /dev/null
@@ -1,9 +0,0 @@
-##
-## Allow any user to run `zpool iostat/status -c smart` in order
-## to read basic SMART health statistics for a pool.
-##
-## CAUTION: Any syntax error introduced here will break sudo.
-## Editing with 'visudo' is recommended: visudo -f  /etc/sudoers.d/zfs 
-##
-
-# ALL ALL = (root) NOPASSWD: /usr/sbin/smartctl -a /dev/[hsv]d[a-z0-9]*
diff --git a/man/man8/zpool-iostat.8 b/man/man8/zpool-iostat.8
index 4abe0895064..16d469849ee 100644
--- a/man/man8/zpool-iostat.8
+++ b/man/man8/zpool-iostat.8
@@ -109,10 +109,7 @@ environment variable set.
 If a script requires the use of a privileged command, like
 .Xr smartctl 8 ,
 then it's recommended you allow the user access to it in
-.Pa /etc/sudoers
-or add the user to the
-.Pa /etc/sudoers.d/zfs
-file.
+.Pa /etc/sudoers .
 .Pp
 If
 .Fl c
diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 48ed7bf2eb7..71923a7808e 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -525,7 +525,6 @@ systemctl --system daemon-reload >/dev/null || true
 %config(noreplace) %{_sysconfdir}/%{name}/zed.d/*
 %config(noreplace) %{_sysconfdir}/%{name}/zpool.d/*
 %config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example
-%attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/*
 
 %config(noreplace) %{_bashcompletiondir}/zfs
 %config(noreplace) %{_bashcompletiondir}/zpool

From a851ba8eb9ac31353619ed53ae6a107ed33cefab Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 4 Jun 2026 16:55:48 -0700
Subject: [PATCH 113/129] CI: Re-enable CodeQL workflows on push

This workflow was disabled 'on push' recently in commit 1916c2c5
to reduce redundant CI runs.  However, this check is fairly quick
and we want it to run regularly against the branches.  Enable it.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18627
---
 .github/workflows/codeql.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 04ad7fae711..689fe71fddc 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -11,7 +11,6 @@ concurrency:
 jobs:
   analyze:
     name: Analyze
-    if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs'
     runs-on: ubuntu-22.04
     permissions:
       actions: read

From cae1421e8dec504193409f7b7ba1c526a8fbc773 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 4 Jun 2026 17:39:39 -0700
Subject: [PATCH 114/129] CI: Update CodeQL actions to v4

CodeQL Action v3 has been deprecated and will be retired
December 2026.  Update codeql.yml to use CodeQL Action v4
and update the runner to ubuntu-24.04.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18629
---
 .github/workflows/codeql.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 689fe71fddc..fbaf53dc61e 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -11,7 +11,7 @@ concurrency:
 jobs:
   analyze:
     name: Analyze
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     permissions:
       actions: read
       contents: read
@@ -31,15 +31,15 @@ jobs:
       uses: actions/checkout@v6
 
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v3
+      uses: github/codeql-action/init@v4
       with:
         config-file: .github/codeql-${{ matrix.language }}.yml
         languages: ${{ matrix.language }}
 
     - name: Autobuild
-      uses: github/codeql-action/autobuild@v3
+      uses: github/codeql-action/autobuild@v4
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v3
+      uses: github/codeql-action/analyze@v4
       with:
         category: "/language:${{matrix.language}}"

From c4d0f3dd4101c360ebf0053beabd263c167bcc63 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 4 Jun 2026 17:41:11 -0700
Subject: [PATCH 115/129] CI: Increase default RCU stall timeout on Linux

When CONFIG_RCU_CPU_STALL_TIMEOUT is configured an RCU stall which
exceeds the default timeout will trigger an NMI and panic the VM.
Given the heavily virtualized nature of the CI environment we want
to make sure to only trigger this due to a real deadlock and not
due to over-subscription of the system's resources.  This timeout
normally defaults to 20-30 seconds and this change increases it
to 120 seconds.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18624
---
 .github/workflows/scripts/qemu-6-tests.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/scripts/qemu-6-tests.sh b/.github/workflows/scripts/qemu-6-tests.sh
index 41c34511357..a0612e5e0b2 100755
--- a/.github/workflows/scripts/qemu-6-tests.sh
+++ b/.github/workflows/scripts/qemu-6-tests.sh
@@ -186,6 +186,13 @@ case "$OS" in
     sudo mount -o noatime /dev/vdb /var/tmp
     sudo chmod 1777 /var/tmp
     sudo mv -f /tmp/*.txt /var/tmp
+
+    # Allow for longer RCU timeouts due to the heavily virtualized and
+    # potentially oversubscribed nature of the CI environment.
+    rcu_cpu_stall_timeout="/sys/module/rcupdate/parameters/rcu_cpu_stall_timeout"
+    if test -f $rcu_cpu_stall_timeout; then
+        echo 120 | sudo sh -c "cat > '$rcu_cpu_stall_timeout'"
+    fi
     ;;
 esac
 

From 6cc449260757fbe1c921e4714cf2ad0a12e65753 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Thu, 4 Jun 2026 17:52:36 -0700
Subject: [PATCH 116/129] CI: Add alternative URLs for CentOS stream

Fall back to trying the "CentOS Stream Composes" repo for the qcow2
images if the regular URLs fail.  The Composes repo contains the daily
autobuilt Stream images.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18628
---
 .github/workflows/scripts/qemu-2-start.sh | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh
index 9770c8903cc..7e72030adb9 100755
--- a/.github/workflows/scripts/qemu-2-start.sh
+++ b/.github/workflows/scripts/qemu-2-start.sh
@@ -28,6 +28,7 @@ NIC="virtio"
 # additional options for virt-install
 OPTS[0]=""
 OPTS[1]=""
+ALT_URL=""
 
 case "$OS" in
   almalinux8)
@@ -56,11 +57,22 @@ case "$OS" in
   centos-stream9)
     OSNAME="CentOS Stream 9"
     URL="https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-GenericCloud-9-latest.x86_64.qcow2"
+
+    # Sometimes we get HTTP errors for the first link.  Fall back to the
+    # "Composes" repo as an alternative.  The "Composes" repo includes
+    # autogenerated nightly CentOS Stream images.  We have to lookup the URL
+    # dynamically since the qcow2 file name has the date in it.
+    ALT_URL=$(wget --accept "CentOS-Stream-GenericCloud-9-*.x86_64.qcow2" --spider -np --recursive  --no-verbose \
+              https://composes.stream.centos.org/stream-9/production/latest-CentOS-Stream/compose/BaseOS/x86_64/images/ 2>&1 | \
+              awk '/200 OK/{print $(NF-2)}')
     ;;
   centos-stream10)
     OSNAME="CentOS Stream 10"
     OSv="centos-stream9"
     URL="https://cloud.centos.org/centos/10-stream/x86_64/images/CentOS-Stream-GenericCloud-10-latest.x86_64.qcow2"
+    ALT_URL=$(wget --accept "CentOS-Stream-GenericCloud-10-*.x86_64.qcow2" --spider -np --recursive  --no-verbose \
+              https://composes.stream.centos.org/stream-10/production/latest-CentOS-Stream/compose/BaseOS/x86_64/images/ 2>&1 | \
+              awk '/200 OK/{print $(NF-2)}')
     ;;
   debian11)
     OSNAME="Debian 11"
@@ -204,6 +216,16 @@ for cmd in 'axel -q -o' 'curl --fail -LSs -o' ; do
   if [ -s "$IMG" ] ; then
     # Successful download
     break
+  else
+    if [ -n "$ALT_URL" ] ; then
+      # Try the $ALT_URL if specified
+      echo "Loading alternative $ALT_URL with $cmd..."
+      time eval "$cmd $IMG $ALT_URL"
+      if [ -s "$IMG" ]; then
+        # Successful ALT_URL download
+        break
+      fi
+    fi
   fi
 done
 

From cab50d5adb84bc1315205399e4b9c7c724b5a729 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Fri, 5 Jun 2026 09:48:55 -0700
Subject: [PATCH 117/129] Add additional verification of size fields and
 strings (#18623)

- Check for size fields that convert to smaller integers.
- Explicitly terminate bootenv string.
- Initialize variables that could be returned in an error case.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Chris Longros <chris.longros@gmail.com>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18623
---
 module/zfs/dmu_recv.c   | 44 +++++++++++++++++++++++++++++------------
 module/zfs/vdev_label.c |  1 +
 module/zfs/zfs_ioctl.c  |  2 +-
 module/zfs/zfs_quota.c  |  8 ++++++--
 4 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index fa18a2056bb..74874bb65d3 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -2901,16 +2901,20 @@ receive_read_record(dmu_recv_cookie_t *drc)
 	{
 		struct drr_object *drro =
 		    &drc->drc_rrd->header.drr_u.drr_object;
-		uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+		uint32_t size;
 		void *buf = NULL;
 		dmu_object_info_t doi;
 
+		size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+		if (size > SPA_MAXBLOCKSIZE)
+			return (SET_ERROR(ERANGE));
+
 		if (size != 0)
-			buf = kmem_zalloc(size, KM_SLEEP);
+			buf = vmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(drc, size, buf);
 		if (err != 0) {
-			kmem_free(buf, size);
+			vmem_free(buf, size);
 			return (err);
 		}
 		err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
@@ -2934,7 +2938,11 @@ receive_read_record(dmu_recv_cookie_t *drc)
 	case DRR_WRITE:
 	{
 		struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
-		int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+		uint64_t size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+
+		if (size > SPA_MAXBLOCKSIZE)
+			return (SET_ERROR(ERANGE));
+
 		abd_t *abd = abd_alloc_linear(size, B_FALSE);
 		err = receive_read_payload_and_next_header(drc, size,
 		    abd_to_buf(abd));
@@ -2951,12 +2959,18 @@ receive_read_record(dmu_recv_cookie_t *drc)
 	{
 		struct drr_write_embedded *drrwe =
 		    &drc->drc_rrd->header.drr_u.drr_write_embedded;
-		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
-		void *buf = kmem_zalloc(size, KM_SLEEP);
+		uint32_t size;
+		void *buf;
+
+		size = P2ROUNDUP(drrwe->drr_psize, 8);
+		if (size > SPA_MAXBLOCKSIZE)
+			return (SET_ERROR(ERANGE));
+
+		buf = vmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(drc, size, buf);
 		if (err != 0) {
-			kmem_free(buf, size);
+			vmem_free(buf, size);
 			return (err);
 		}
 
@@ -2985,7 +2999,11 @@ receive_read_record(dmu_recv_cookie_t *drc)
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
-		int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+		uint64_t size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+
+		if (size > SPA_MAXBLOCKSIZE)
+			return (SET_ERROR(ERANGE));
+
 		abd_t *abd = abd_alloc_linear(size, B_FALSE);
 		err = receive_read_payload_and_next_header(drc, size,
 		    abd_to_buf(abd));
@@ -3136,7 +3154,7 @@ receive_process_record(struct receive_writer_arg *rwa,
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		} else if (rrd->payload != NULL) {
-			kmem_free(rrd->payload, rrd->payload_size);
+			vmem_free(rrd->payload, rrd->payload_size);
 			rrd->payload = NULL;
 		}
 		return (0);
@@ -3150,7 +3168,7 @@ receive_process_record(struct receive_writer_arg *rwa,
 				rrd->abd = NULL;
 				rrd->payload = NULL;
 			} else if (rrd->payload != NULL) {
-				kmem_free(rrd->payload, rrd->payload_size);
+				vmem_free(rrd->payload, rrd->payload_size);
 				rrd->payload = NULL;
 			}
 
@@ -3163,7 +3181,7 @@ receive_process_record(struct receive_writer_arg *rwa,
 	{
 		struct drr_object *drro = &rrd->header.drr_u.drr_object;
 		err = receive_object(rwa, drro, rrd->payload);
-		kmem_free(rrd->payload, rrd->payload_size);
+		vmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	}
@@ -3201,7 +3219,7 @@ receive_process_record(struct receive_writer_arg *rwa,
 		struct drr_write_embedded *drrwe =
 		    &rrd->header.drr_u.drr_write_embedded;
 		err = receive_write_embedded(rwa, drrwe, rrd->payload);
-		kmem_free(rrd->payload, rrd->payload_size);
+		vmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	}
@@ -3270,7 +3288,7 @@ receive_writer_thread(void *arg)
 			rrd->abd = NULL;
 			rrd->payload = NULL;
 		} else if (rrd->payload != NULL) {
-			kmem_free(rrd->payload, rrd->payload_size);
+			vmem_free(rrd->payload, rrd->payload_size);
 			rrd->payload = NULL;
 		}
 		/*
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 54d253c1b7d..e6da5c1707a 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -1405,6 +1405,7 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
 				    VB_NVLIST);
 				break;
 			}
+			vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
 			fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
 		}
 
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index aeefab4fa64..a23f397e698 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -7180,7 +7180,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
 	dsl_pool_t *dp;
 	dsl_dataset_t *new, *old;
 	const char *firstsnap;
-	uint64_t used, comp, uncomp;
+	uint64_t used = 0, comp = 0, uncomp = 0;
 
 	firstsnap = fnvlist_lookup_string(innvl, "firstsnap");
 
diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c
index 85b7a549b9a..0b51f8669cb 100644
--- a/module/zfs/zfs_quota.c
+++ b/module/zfs/zfs_quota.c
@@ -86,10 +86,14 @@ zpl_get_file_info(dmu_object_type_t bonustype, const void *data,
 		sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
 		swap = B_TRUE;
 	}
-	VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
+
+	if (unlikely(sa.sa_magic != SA_MAGIC))
+		return (SET_ERROR(EINVAL));
 
 	int hdrsize = sa_hdrsize(&sa);
-	VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
+
+	if (unlikely(hdrsize < sizeof (sa_hdr_phys_t)))
+		return (SET_ERROR(EINVAL));
 
 	uintptr_t data_after_hdr = (uintptr_t)data + hdrsize;
 	zoi->zfi_user = *((uint64_t *)(data_after_hdr + SA_UID_OFFSET));

From 9ff3fdfc54d69792a60871aaac09d610c762cc10 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 4 Jun 2026 23:37:10 +1000
Subject: [PATCH 118/129] zap: remove zap_increment_int()

This is a strange function that can't possibly work sensibly with
zap_add_int()/zap_remove_int()/zap_lookup_int(), as it allows the key
and value to diverge, which is not how these functions appear to work.

It would make more sense if it were called zap_increment_int_key(), as
that family can have divergent key and value. But it doesn't.

Fortunately, nothing uses it. There was a function named
zap_increment_int() in Sun ZFS, that was renamed to zap_increment()
early in ZoL's lifetime, and is unrelated.

So, remove it, and fix up some very old comments referring to it.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 include/sys/zap.h       |  2 --
 module/zfs/dmu_objset.c |  2 +-
 module/zfs/zap.c        | 11 -----------
 3 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index ad20d427ad9..cde4c72764a 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -402,8 +402,6 @@ int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
 int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
 int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
 int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
-int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
-    dmu_tx_t *tx);
 
 /* Here the key is an int and the value is a different int. */
 int zap_add_int_key(objset_t *os, uint64_t obj,
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 4919ead3cea..654afe2f844 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1859,7 +1859,7 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
 	    &cookie)) != NULL) {
 		/*
 		 * os_userused_lock protects against concurrent calls to
-		 * zap_increment_int().  It's needed because zap_increment_int()
+		 * zap_increment().  It's needed because zap_increment()
 		 * is not thread-safe (i.e. not atomic).
 		 */
 		mutex_enter(&os->os_userused_lock);
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index ee94917d8e8..e14ff1027f6 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -1029,16 +1029,6 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
 	return (zap_lookup(os, obj, name, 8, 1, &value));
 }
 
-int
-zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
-    dmu_tx_t *tx)
-{
-	char name[20];
-
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
-	return (zap_increment(os, obj, name, delta, tx));
-}
-
 /* zap_*_int_key */
 
 int
@@ -1329,7 +1319,6 @@ EXPORT_SYMBOL(zap_join_increment);
 EXPORT_SYMBOL(zap_add_int);
 EXPORT_SYMBOL(zap_remove_int);
 EXPORT_SYMBOL(zap_lookup_int);
-EXPORT_SYMBOL(zap_increment_int);
 EXPORT_SYMBOL(zap_add_int_key);
 EXPORT_SYMBOL(zap_lookup_int_key);
 EXPORT_SYMBOL(zap_increment);

From 47b53fda7d23b4867ea3c6db2014d60960184277 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Fri, 5 Jun 2026 15:29:16 +1000
Subject: [PATCH 119/129] zap: remove zap_join() functions

These work, but are limited in their focus (single uint64_t key). The
last use anywhere was removed in d4a72f2386 (~2017). Better to remove
them rather than bother to uplift them to the new _by_dnode() structure.
They're simple to recreate if we ever do need them again.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 include/sys/zap.h | 15 ---------
 module/zfs/zap.c  | 84 -----------------------------------------------
 2 files changed, 99 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index cde4c72764a..2fd7332e124 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -380,21 +380,6 @@ int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
 int zap_value_search(objset_t *os, uint64_t zapobj,
     uint64_t value, uint64_t mask, char *name, uint64_t namelen);
 
-/*
- * Transfer all the entries from fromobj into intoobj.  Only works on
- * int_size=8 num_integers=1 values.  Fails if there are any duplicated
- * entries.
- */
-int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
-
-/* Same as zap_join, but set the values to 'value'. */
-int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
-    uint64_t value, dmu_tx_t *tx);
-
-/* Same as zap_join, but add together any duplicated entries. */
-int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
-    dmu_tx_t *tx);
-
 /*
  * Manipulate entries where the name + value are the "same" (the name is
  * a stringified version of the value).
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index e14ff1027f6..76582a25802 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -918,88 +918,6 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
 	return (err);
 }
 
-/* zap_join */
-
-int
-zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
-{
-	zap_cursor_t zc;
-	int err = 0;
-
-	zap_attribute_t *za = zap_attribute_long_alloc();
-	for (zap_cursor_init(&zc, os, fromobj);
-	    zap_cursor_retrieve(&zc, za) == 0;
-	    (void) zap_cursor_advance(&zc)) {
-		if (za->za_integer_length != 8 || za->za_num_integers != 1) {
-			err = SET_ERROR(EINVAL);
-			break;
-		}
-		err = zap_add(os, intoobj, za->za_name,
-		    8, 1, &za->za_first_integer, tx);
-		if (err != 0)
-			break;
-	}
-	zap_cursor_fini(&zc);
-	zap_attribute_free(za);
-	return (err);
-}
-
-int
-zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
-    uint64_t value, dmu_tx_t *tx)
-{
-	zap_cursor_t zc;
-	int err = 0;
-
-	zap_attribute_t *za = zap_attribute_long_alloc();
-	for (zap_cursor_init(&zc, os, fromobj);
-	    zap_cursor_retrieve(&zc, za) == 0;
-	    (void) zap_cursor_advance(&zc)) {
-		if (za->za_integer_length != 8 || za->za_num_integers != 1) {
-			err = SET_ERROR(EINVAL);
-			break;
-		}
-		err = zap_add(os, intoobj, za->za_name,
-		    8, 1, &value, tx);
-		if (err != 0)
-			break;
-	}
-	zap_cursor_fini(&zc);
-	zap_attribute_free(za);
-	return (err);
-}
-
-int
-zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
-    dmu_tx_t *tx)
-{
-	zap_cursor_t zc;
-	int err = 0;
-
-	zap_attribute_t *za = zap_attribute_long_alloc();
-	for (zap_cursor_init(&zc, os, fromobj);
-	    zap_cursor_retrieve(&zc, za) == 0;
-	    (void) zap_cursor_advance(&zc)) {
-		uint64_t delta = 0;
-
-		if (za->za_integer_length != 8 || za->za_num_integers != 1) {
-			err = SET_ERROR(EINVAL);
-			break;
-		}
-
-		err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta);
-		if (err != 0 && err != ENOENT)
-			break;
-		delta += za->za_first_integer;
-		err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx);
-		if (err != 0)
-			break;
-	}
-	zap_cursor_fini(&zc);
-	zap_attribute_free(za);
-	return (err);
-}
-
 /* zap_*_int */
 
 int
@@ -1314,8 +1232,6 @@ EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
 EXPORT_SYMBOL(zap_count);
 EXPORT_SYMBOL(zap_count_by_dnode);
 EXPORT_SYMBOL(zap_value_search);
-EXPORT_SYMBOL(zap_join);
-EXPORT_SYMBOL(zap_join_increment);
 EXPORT_SYMBOL(zap_add_int);
 EXPORT_SYMBOL(zap_remove_int);
 EXPORT_SYMBOL(zap_lookup_int);

From 1266435523bfe37a3be019a292190e8197d74139 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sun, 10 May 2026 14:08:40 +1000
Subject: [PATCH 120/129] zap: add zap_increment_by_dnode()

Make consistent with the standard pattern, with zap_increment() becoming
a simple wrapper around zap_increment_by_dnode().

This has a small, likely unnoticeable, behaviour change. The previous
version didn't use the _by_dnode() functions, so the ZAP, dnode and dbuf
could theoretically be evicted between calls. With the dnode held across
the calls, this won't happen anymore. This is almost certainly a good
thing.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 include/sys/zap.h |  2 ++
 module/zfs/zap.c  | 21 +++++++++++++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index 2fd7332e124..843f1b4004f 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -370,6 +370,8 @@ int zap_count_by_dnode(dnode_t *dn, uint64_t *count);
  */
 int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
     dmu_tx_t *tx);
+int zap_increment_by_dnode(dnode_t *dn, const char *name, int64_t delta,
+    dmu_tx_t *tx);
 
 /*
  * Returns (in name) the name of the entry whose (value & mask)
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 76582a25802..8be6aa7299f 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -872,7 +872,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 /* zap_increment */
 
 int
-zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+zap_increment_by_dnode(dnode_t *dn, const char *name, int64_t delta,
     dmu_tx_t *tx)
 {
 	uint64_t value = 0;
@@ -880,14 +880,27 @@ zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
 	if (delta == 0)
 		return (0);
 
-	int err = zap_lookup(os, obj, name, 8, 1, &value);
+	int err = zap_lookup_by_dnode(dn, name, 8, 1, &value);
 	if (err != 0 && err != ENOENT)
 		return (err);
 	value += delta;
 	if (value == 0)
-		err = zap_remove(os, obj, name, tx);
+		err = zap_remove_by_dnode(dn, name, tx);
 	else
-		err = zap_update(os, obj, name, 8, 1, &value, tx);
+		err = zap_update_by_dnode(dn, name, 8, 1, &value, tx);
+	return (err);
+}
+
+int
+zap_increment(objset_t *os, uint64_t zapobj, const char *name, int64_t delta,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, zapobj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = zap_increment_by_dnode(dn, name, delta, tx);
+	dnode_rele(dn, FTAG);
 	return (err);
 }
 

From fd70c222f98b07e42b7edbff0a5dad23e5ac82e5 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sat, 23 May 2026 16:16:50 +1000
Subject: [PATCH 121/129] unit/zap: zap_increment

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 tests/unit/test_zap.c | 51 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/tests/unit/test_zap.c b/tests/unit/test_zap.c
index 276b6127e3f..2c232218860 100644
--- a/tests/unit/test_zap.c
+++ b/tests/unit/test_zap.c
@@ -448,6 +448,55 @@ test_zap_length(const MunitParameter params[], void *data)
 	return (MUNIT_OK);
 }
 
+/* zap_increment: add integer value to existing integer */
+static MunitResult
+test_zap_increment(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	uint64_t r = 0;
+
+	/* Increment a missing key creates it with that value. */
+	unit_ok(zap_increment_by_dnode(dn, "a", 5, tx));
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &r));
+	unit_eq(r, 5);
+
+	/* Further increments accumulate. */
+	unit_ok(zap_increment_by_dnode(dn, "a", 3, tx));
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &r));
+	unit_eq(r, 8);
+
+	/* Decrement works. */
+	unit_ok(zap_increment_by_dnode(dn, "a", -2, tx));
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &r));
+	unit_eq(r, 6);
+
+	/* Zero delta leaves it unchanged. */
+	r = 0;
+	unit_ok(zap_increment_by_dnode(dn, "a", 0, tx));
+	unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &r));
+	unit_eq(r, 6);
+
+	/* Decrementing to zero removes the entry. */
+	unit_ok(zap_increment_by_dnode(dn, "a", -6, tx));
+	unit_err(zap_lookup_by_dnode(dn, "a",
+	    sizeof (uint64_t), 1, &r), ENOENT);
+
+	/* Delta of zero is a no-op even for a missing key. */
+	unit_ok(zap_increment_by_dnode(dn, "a", 0, tx));
+	unit_err(zap_lookup_by_dnode(dn, "a",
+	    sizeof (uint64_t), 1, &r), ENOENT);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
 /* ========== */
 
 /*
@@ -860,6 +909,8 @@ static const MunitTest zap_tests[] = {
 	UNIT_TEST_ZAP_TYPES("zap_contains",	test_zap_contains),
 	UNIT_TEST_ZAP_TYPES("zap_length",	test_zap_length),
 
+	UNIT_TEST_ZAP_TYPES("zap_increment",	test_zap_increment),
+
 	UNIT_TEST("microzap_stats",		test_microzap_stats),
 	UNIT_TEST("fatzap_stats",		test_fatzap_stats),
 

From 0ff134fbdde3c15cf3e43a3018282fbe7192ecf1 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sun, 10 May 2026 15:29:26 +1000
Subject: [PATCH 122/129] zap: add _by_dnode() variants for int and int_key
 functions

These functions are far too simple to make wrapping worthwhile, so
instead we just lift the important shared bit - the value->string
conversion - into a small macro, and use it in all of them.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 include/sys/zap.h | 11 ++++++++
 module/zfs/zap.c  | 67 ++++++++++++++++++++++++++++++++++-------------
 2 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index 843f1b4004f..632e7d02050 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -390,6 +390,10 @@ int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
 int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
 int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
 
+int zap_add_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx);
+int zap_remove_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_by_dnode(dnode_t *dn, uint64_t value);
+
 /* Here the key is an int and the value is a different int. */
 int zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx);
@@ -398,6 +402,13 @@ int zap_update_int_key(objset_t *os, uint64_t obj,
 int zap_lookup_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t *valuep);
 
+int zap_add_int_key_by_dnode(dnode_t *dn,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_update_int_key_by_dnode(dnode_t *dn,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key_by_dnode(dnode_t *dn,
+    uint64_t key, uint64_t *valuep);
+
 /*
  * The interface for listing all the attributes of a zapobj can be
  * thought of as cursor moving down a list of the attributes one by
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 8be6aa7299f..b0aef12d119 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -933,63 +933,94 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
 
 /* zap_*_int */
 
+#define	FORMAT_INT_KEY(name, value)	\
+	char name[20];	/* 16 hex digits of a uint64_t, plus the NUL */	\
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)(value));
+
 int
 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
-	char name[20];
-
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+	FORMAT_INT_KEY(name, value);
 	return (zap_add(os, obj, name, 8, 1, &value, tx));
 }
+int
+zap_add_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx)
+{
+	FORMAT_INT_KEY(name, value);
+	return (zap_add_by_dnode(dn, name, 8, 1, &value, tx));
+}
 
 int
 zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
-	char name[20];
-
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+	FORMAT_INT_KEY(name, value);
 	return (zap_remove(os, obj, name, tx));
 }
+int
+zap_remove_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx)
+{
+	FORMAT_INT_KEY(name, value);
+	return (zap_remove_by_dnode(dn, name, tx));
+}
 
 int
 zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
 {
-	char name[20];
-
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+	FORMAT_INT_KEY(name, value);
 	return (zap_lookup(os, obj, name, 8, 1, &value));
 }
 
+int
+zap_lookup_int_by_dnode(dnode_t *dn, uint64_t value)
+{
+	FORMAT_INT_KEY(name, value);
+	return (zap_lookup_by_dnode(dn, name, 8, 1, &value));
+}
+
 /* zap_*_int_key */
 
 int
 zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx)
 {
-	char name[20];
-
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	FORMAT_INT_KEY(name, key);
 	return (zap_add(os, obj, name, 8, 1, &value, tx));
 }
+int
+zap_add_int_key_by_dnode(dnode_t *dn,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+	FORMAT_INT_KEY(name, key);
+	return (zap_add_by_dnode(dn, name, 8, 1, &value, tx));
+}
 
 int
 zap_update_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx)
 {
-	char name[20];
-
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	FORMAT_INT_KEY(name, key);
 	return (zap_update(os, obj, name, 8, 1, &value, tx));
 }
+int
+zap_update_int_key_by_dnode(dnode_t *dn,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+	FORMAT_INT_KEY(name, key);
+	return (zap_update_by_dnode(dn, name, 8, 1, &value, tx));
+}
 
 int
 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 {
-	char name[20];
-
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	FORMAT_INT_KEY(name, key);
 	return (zap_lookup(os, obj, name, 8, 1, valuep));
 }
+int
+zap_lookup_int_key_by_dnode(dnode_t *dn, uint64_t key, uint64_t *valuep)
+{
+	FORMAT_INT_KEY(name, key);
+	return (zap_lookup_by_dnode(dn, name, 8, 1, valuep));
+}
 
 /* zap_cursor */
 

From c869c0f240bf7fa5d73d21c09b469bbc3baba28e Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Fri, 29 May 2026 21:50:31 +1000
Subject: [PATCH 123/129] unit/zap: zap_*_int and zap_*_int_key

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 tests/unit/test_zap.c | 91 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/tests/unit/test_zap.c b/tests/unit/test_zap.c
index 2c232218860..ae5f23f2bb4 100644
--- a/tests/unit/test_zap.c
+++ b/tests/unit/test_zap.c
@@ -499,6 +499,94 @@ test_zap_increment(const MunitParameter params[], void *data)
 
 /* ========== */
 
+/*
+ * zap_add_int/zap_remove_int/zap_lookup_int: single uint64_t value,
+ * stringified to form the key.
+ */
+static MunitResult
+test_zap_int(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* Add some ints. */
+	unit_ok(zap_add_int_by_dnode(dn, 5, tx));
+	unit_ok(zap_add_int_by_dnode(dn, 17, tx));
+
+	/* Confirm they're there. */
+	unit_ok(zap_lookup_int_by_dnode(dn, 17));
+	unit_ok(zap_lookup_int_by_dnode(dn, 5));
+
+	/* But not something we didn't add. */
+	unit_err(zap_lookup_int_by_dnode(dn, 23), ENOENT);
+
+	/* Adding something that already exists fails. */
+	unit_err(zap_add_int_by_dnode(dn, 17, tx), EEXIST);
+
+	/* Removing it works, and then it can't be found. */
+	unit_ok(zap_remove_int_by_dnode(dn, 17, tx));
+	unit_err(zap_lookup_int_by_dnode(dn, 17), ENOENT);
+
+	/* And it can be added back. */
+	unit_ok(zap_add_int_by_dnode(dn, 17, tx));
+	unit_ok(zap_lookup_int_by_dnode(dn, 17));
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* zap_*_int_key: like zap_*_int, but with separate value. */
+static MunitResult
+test_zap_int_keys(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* Add some ints. */
+	unit_ok(zap_add_int_key_by_dnode(dn, 5, 17, tx));
+	unit_ok(zap_add_int_key_by_dnode(dn, 23, 35, tx));
+
+	/* Confirm they're there. */
+	uint64_t r = 0;
+	unit_ok(zap_lookup_int_key_by_dnode(dn, 5, &r));
+	unit_eq(r, 17);
+	unit_ok(zap_lookup_int_key_by_dnode(dn, 23, &r));
+	unit_eq(r, 35);
+
+	/* But not something we didn't add. */
+	unit_err(zap_lookup_int_key_by_dnode(dn, 79, &r), ENOENT);
+
+	/* Adding something that already exists fails. */
+	unit_err(zap_add_int_key_by_dnode(dn, 23, 51, tx), EEXIST);
+
+	/* Updating it works though. */
+	unit_ok(zap_update_int_key_by_dnode(dn, 23, 51, tx));
+
+	/* Removing it works, and then it can't be found. */
+	unit_ok(zap_remove_int_by_dnode(dn, 23, tx));
+	unit_err(zap_lookup_int_key_by_dnode(dn, 23, &r), ENOENT);
+
+	/* And it can be added back. */
+	unit_ok(zap_add_int_key_by_dnode(dn, 23, 11, tx));
+	unit_ok(zap_lookup_int_key_by_dnode(dn, 23, &r));
+	unit_eq(r, 11);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* ========== */
+
 /*
  * Separate stats tests for each ZAP type, since they are about internals and
  * so can and will produce different results.
@@ -911,6 +999,9 @@ static const MunitTest zap_tests[] = {
 
 	UNIT_TEST_ZAP_TYPES("zap_increment",	test_zap_increment),
 
+	UNIT_TEST_ZAP_TYPES("zap_int",		test_zap_int),
+	UNIT_TEST_ZAP_TYPES("zap_int_keys",	test_zap_int_keys),
+
 	UNIT_TEST("microzap_stats",		test_microzap_stats),
 	UNIT_TEST("fatzap_stats",		test_fatzap_stats),
 

From 089a54fc19650abe284139181f0304891d537d8f Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Sun, 10 May 2026 15:35:41 +1000
Subject: [PATCH 124/129] zap: add zap_value_search_by_dnode()

This operates entirely on a cursor, so the two entry points just
instantiate a new cursor and then pass it into the worker.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 include/sys/zap.h |  2 ++
 module/zfs/zap.c  | 30 +++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/sys/zap.h b/include/sys/zap.h
index 632e7d02050..50e7079e014 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -381,6 +381,8 @@ int zap_increment_by_dnode(dnode_t *dn, const char *name, int64_t delta,
  */
 int zap_value_search(objset_t *os, uint64_t zapobj,
     uint64_t value, uint64_t mask, char *name, uint64_t namelen);
+int zap_value_search_by_dnode(dnode_t *dn,
+    uint64_t value, uint64_t mask, char *name, uint64_t namelen);
 
 /*
  * Manipulate entries where the name + value are the "same" (the name is
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index b0aef12d119..ca7598f489b 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -906,31 +906,47 @@ zap_increment(objset_t *os, uint64_t zapobj, const char *name, int64_t delta,
 
 /* zap_value_search */
 
-int
-zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
+static int
+zap_value_search_impl(zap_cursor_t *zc, uint64_t value, uint64_t mask,
     char *name, uint64_t namelen)
 {
-	zap_cursor_t zc;
 	int err;
 
 	if (mask == 0)
 		mask = -1ULL;
 
 	zap_attribute_t *za = zap_attribute_long_alloc();
-	for (zap_cursor_init(&zc, os, zapobj);
-	    (err = zap_cursor_retrieve(&zc, za)) == 0;
-	    zap_cursor_advance(&zc)) {
+	for (; (err = zap_cursor_retrieve(zc, za)) == 0;
+	    zap_cursor_advance(zc)) {
 		if ((za->za_first_integer & mask) == (value & mask)) {
 			if (strlcpy(name, za->za_name, namelen) >= namelen)
 				err = SET_ERROR(ENAMETOOLONG);
 			break;
 		}
 	}
-	zap_cursor_fini(&zc);
+	zap_cursor_fini(zc);
 	zap_attribute_free(za);
 	return (err);
 }
 
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
+    char *name, uint64_t namelen)
+{
+	zap_cursor_t zc;
+	zap_cursor_init(&zc, os, zapobj);
+	return (zap_value_search_impl(&zc, value, mask, name, namelen));
+}
+
+int
+zap_value_search_by_dnode(dnode_t *dn, uint64_t value, uint64_t mask,
+    char *name, uint64_t namelen)
+{
+	zap_cursor_t zc;
+	zap_cursor_init_by_dnode(&zc, dn);
+	return (zap_value_search_impl(&zc, value, mask, name, namelen));
+}
+
 /* zap_*_int */
 
 #define	FORMAT_INT_KEY(name, value)	\

From 63fad3403c79f9c89f634f5c921c1736c0844899 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Fri, 5 Jun 2026 14:21:38 +1000
Subject: [PATCH 125/129] unit: rand helpers

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 tests/unit/unit.c | 20 ++++++++++++++++++++
 tests/unit/unit.h |  4 ++++
 2 files changed, 24 insertions(+)

diff --git a/tests/unit/unit.c b/tests/unit/unit.c
index 81b2e93975f..3dd2e7de5d5 100644
--- a/tests/unit/unit.c
+++ b/tests/unit/unit.c
@@ -24,6 +24,7 @@
 #include <sys/zfs_debug.h>
 
 #include "munit.h"
+#include "unit.h"
 
 /*
  * SET_ERROR() expands to __set_error() in debug builds. It's an
@@ -83,3 +84,22 @@ cmn_err(int ce, const char *fmt, ...)
 		break;
 	}
 }
+
+/* helpers to generate useful random data */
+uint64_t
+unit_rand_uint64(void)
+{
+	uint64_t v =
+	    (((uint64_t)munit_rand_uint32()) << 32) |
+	    ((uint64_t)munit_rand_uint32());
+	return (v);
+}
+
+char *
+unit_rand_str(char *buf, size_t bufsz)
+{
+	/* [a-z] in all but the last byte (NUL); bufsz == 0 writes nothing. */
+	for (size_t i = 0; i < bufsz; i++)
+		buf[i] = (i + 1 < bufsz) ? munit_rand_int_range('a', 'z') : 0;
+	return (buf);
+}
diff --git a/tests/unit/unit.h b/tests/unit/unit.h
index 6b655082092..a8c23da4118 100644
--- a/tests/unit/unit.h
+++ b/tests/unit/unit.h
@@ -57,4 +57,8 @@
 #define	unit_ok(a)	munit_assert_int((a), ==, 0)
 #define	unit_err(a, e)	munit_assert_int((a), ==, (e))
 
+/* helpers to generate useful random data */
+extern uint64_t unit_rand_uint64(void);
+extern char *unit_rand_str(char *buf, size_t bufsz);
+
 #endif /* UNIT_H */

From cf80080a0f5d1fef6268e2ac72287e22647022f2 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Wed, 6 May 2026 10:12:47 +1000
Subject: [PATCH 126/129] unit/zap: zap_value_search

Two separate tests, one for basic operation and one for the "value mask"
feature.

Sponsored-by: TrueNAS
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18630
---
 tests/unit/test_zap.c | 127 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)

diff --git a/tests/unit/test_zap.c b/tests/unit/test_zap.c
index ae5f23f2bb4..c64de7d75c4 100644
--- a/tests/unit/test_zap.c
+++ b/tests/unit/test_zap.c
@@ -971,6 +971,128 @@ test_cursor_release_one(const MunitParameter params[], void *data)
 
 /* ========== */
 
+/* zap_value_search: find key with given uint64 value. */
+static MunitResult
+test_zap_value_search(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/* Add some items. */
+	uint64_t v1 = 1, v2 = 2, v3 = 3;
+	unit_ok(zap_add_by_dnode(dn, "one", sizeof (uint64_t), 1, &v1, tx));
+	unit_ok(zap_add_by_dnode(dn, "two", sizeof (uint64_t), 1, &v2, tx));
+	unit_ok(zap_add_by_dnode(dn, "three", sizeof (uint64_t), 1, &v3, tx));
+
+	char name[ZAP_MAXNAMELEN];
+
+	/* Find one of them. */
+	unit_ok(zap_value_search_by_dnode(dn, 2, 0, name, sizeof (name)));
+	unit_str_eq(name, "two");
+
+	/* Nonexistent value. */
+	unit_err(zap_value_search_by_dnode(dn, 10, 0,
+	    name, sizeof (name)), ENOENT);
+
+	/* Buffer too small for the key. */
+	unit_err(zap_value_search_by_dnode(dn, 3, 0, name, 2), ENAMETOOLONG);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* zap_value_search: value masks */
+static MunitResult
+test_zap_value_search_mask(const MunitParameter params[], void *data)
+{
+	(void) data;
+
+	dnode_t *dn = mock_zap_create_params(params, "type");
+	dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create();
+
+	/*
+	 * Add a set of values. These all have the same bottom 16 bits, with
+	 * different upper 48 bits, segmented so we can mask them in different
+	 * and interesting ways.
+	 */
+	uint64_t v1 = 0x000000000000f0f0ull;
+	uint64_t v2 = 0x00000000fffff0f0ull;
+	uint64_t v3 = 0x0000ffff0000f0f0ull;
+	uint64_t v4 = 0xffff00000000f0f0ull;
+
+	/*
+	 * Generate four random keys. We do this because zap_value_search() is
+	 * implemented with a simple cursor walk, so will always return the
+	 * first match in hash order, which with fixed keys will always give
+	 * exactly the same results. Using random keys ensures the test values
+	 * are encountered in different orders between test runs, giving us
+	 * better coverage when there are multiple matches.
+	 */
+
+	char k1[9], k2[9], k3[9], k4[9];
+	unit_rand_str(k1, sizeof (k1));
+	unit_rand_str(k2, sizeof (k2));
+	unit_rand_str(k3, sizeof (k3));
+	unit_rand_str(k4, sizeof (k4));
+
+	unit_ok(zap_add_by_dnode(dn, k1, sizeof (uint64_t), 1, &v1, tx));
+	unit_ok(zap_add_by_dnode(dn, k2, sizeof (uint64_t), 1, &v2, tx));
+	unit_ok(zap_add_by_dnode(dn, k3, sizeof (uint64_t), 1, &v3, tx));
+	unit_ok(zap_add_by_dnode(dn, k4, sizeof (uint64_t), 1, &v4, tx));
+
+	char name[ZAP_MAXNAMELEN];
+
+	/* 0 mask is equivalent to all bits set in mask, i.e. exact match. */
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0xf0f0, 0, name, sizeof (name)));
+	unit_str_eq(name, k1);
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0xf0f0, 0xffffffffffffffffull, name, sizeof (name)));
+	unit_str_eq(name, k1);
+
+	/* Low 16 bits could match any. */
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0xf0f0, 0xffff, name, sizeof (name)));
+
+	/* Low 32 bits, 3/1 matches. */
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0x0000f0f0, 0xffffffff, name, sizeof (name)));
+	unit_true(strcmp(name, k1) == 0 || strcmp(name, k3) == 0 ||
+	    strcmp(name, k4) == 0);
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0xfffff0f0, 0xffffffff, name, sizeof (name)));
+	unit_str_eq(name, k2);
+
+	/* Low 48 bits, 2/1/1 matches */
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0x00000000f0f0ull, 0xffffffffffffull, name, sizeof (name)));
+	unit_true(strcmp(name, k1) == 0 || strcmp(name, k4) == 0);
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0x0000fffff0f0ull, 0xffffffffffffull, name, sizeof (name)));
+	unit_str_eq(name, k2);
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0xffff0000f0f0ull, 0xffffffffffffull, name, sizeof (name)));
+	unit_str_eq(name, k3);
+
+	/* Value doesn't exist directly, but matches when mask applied. */
+	unit_ok(zap_value_search_by_dnode(dn,
+	    0xffffffff, 0xffff0000, name, sizeof (name)));
+	unit_str_eq(name, k2);
+
+	mock_tx_destroy((mock_dmu_tx_t *)tx);
+	unit_true(mock_zap_is_params(dn, params, "type"));
+	mock_zap_destroy(dn);
+
+	return (MUNIT_OK);
+}
+
+/* ========== */
+
 /* Test suite definition and boilerplate. */
 
 #define	UNIT_PARAM_ZAP_TYPES(p)	\
@@ -1017,6 +1139,11 @@ static const MunitTest zap_tests[] = {
 	UNIT_TEST_ZAP_TYPES(
 	    "cursor_release_one",	test_cursor_release_one),
 
+	UNIT_TEST_ZAP_TYPES(
+	    "zap_value_search",		test_zap_value_search),
+	UNIT_TEST_ZAP_TYPES(
+	    "zap_value_search_mask",	test_zap_value_search_mask),
+
 	{ 0 },
 };
 

From a8ef128da21fdcdbf0617e69b79a0c4843b4fb20 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 5 Jun 2026 14:02:47 -0700
Subject: [PATCH 127/129] Fix uninitialized variable warning in zil_parse()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This resolves the following possible uninitialized variable warning
when building with --enable-code-coverage and gcc 8.5.0.

    module/zfs/zil.c: In function ‘zil_parse’:
    module/zfs/zil.c:549:47: warning: ‘end’ may be used uninitialized
    in this function [-Wmaybe-uninitialized]

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18633
---
 module/zfs/zil.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 0fa58d5ccb6..433d27dd2d1 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -499,7 +499,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		int reclen;
-		char *lrp, *end;
+		char *lrp = NULL, *end = NULL;
 		arc_buf_t *abuf = NULL;
 
 		if (blk_seq > claim_blk_seq)

From b35bf7e7c63c4ca75e664dffbedb03009dd06f92 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Sat, 6 Jun 2026 10:15:11 -0700
Subject: [PATCH 128/129] ZTS: relax zpool_import_parallel_pos.ksh timing

Occasionally in the CI this test will fail because the parallel import
took longer than half of the serial time (but still less than the full
serial time).  Increase the cutoff to 3/4 of the serial time to preserve
the intent yet try and avoid these false positive failures.

Reviewed-by: Chris Longros <chris.longros@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #18634
---
 .../cli_root/zpool_import/zpool_import_parallel_pos.ksh       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
index 60088e6dd97..be3344326e9 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
@@ -114,7 +114,7 @@ wait
 parallel_time=$SECONDS
 log_note "asyncronously imported 4 pools in $parallel_time seconds"
 
-log_must test $parallel_time -lt $(($sequential_time / 2))
+log_must test $parallel_time -lt $(($sequential_time * 3 / 4))
 
 #
 # export pools with import delay injectors
@@ -133,6 +133,6 @@ log_must zpool import -a -d $DEVICE_DIR -f
 parallel_time=$SECONDS
 log_note "asyncronously imported 4 pools in $parallel_time seconds"
 
-log_must test $parallel_time -lt $(($sequential_time / 2))
+log_must test $parallel_time -lt $(($sequential_time * 3 / 4))
 
 log_pass "Pool imports occur in parallel"

From a170134febea405c6b6f5ed51724cdcfb6d8e726 Mon Sep 17 00:00:00 2001
From: Christos Longros <98426896+chrislongros@users.noreply.github.com>
Date: Sat, 6 Jun 2026 19:16:23 +0200
Subject: [PATCH 129/129] metaslab: expose df_alloc_threshold and df_free_pct
 on Linux

Expose metaslab_df_alloc_threshold and metaslab_df_free_pct as module
parameters on Linux, matching their existing FreeBSD sysctls.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christos Longros <chris.longros@gmail.com>
Closes #18632
---
 man/man4/zfs.4                    | 12 ++++++++++++
 module/os/freebsd/zfs/sysctl_os.c | 26 --------------------------
 module/zfs/metaslab.c             |  8 ++++++++
 3 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 09195b03e1a..9967d9af739 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -390,6 +390,18 @@ this is
 or
 .Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 .
 .
+.It Sy metaslab_df_alloc_threshold Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64
+Minimum size which forces the dynamic allocator to change its allocation
+strategy.
+Once the space map cannot satisfy an allocation of this size, it switches to a
+more aggressive strategy (searching by size rather than offset).
+.
+.It Sy metaslab_df_free_pct Ns = Ns Sy 4 Ns % Pq uint
+The minimum free space, in percent, which must be available in a space map to
+continue allocations in a first-fit fashion.
+Once free space drops below this level, allocations switch to a best-fit
+strategy.
+.
 .It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int
 If not searching forward (due to
 .Sy metaslab_df_max_search , metaslab_df_free_pct ,
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index a0a721aec20..32f2db739ce 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -289,32 +289,6 @@ param_set_active_allocator(SYSCTL_HANDLER_ARGS)
 	return (param_set_active_allocator_common(buf));
 }
 
-/*
- * Minimum size which forces the dynamic allocator to change
- * it's allocation strategy.  Once the space map cannot satisfy
- * an allocation of this size then it switches to using more
- * aggressive strategy (i.e search by size rather than offset).
- */
-extern uint64_t metaslab_df_alloc_threshold;
-
-SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold,
-	CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0,
-	"Minimum size which forces the dynamic allocator to change its"
-	" allocation strategy");
-
-/*
- * The minimum free space, in percent, which must be available
- * in a space map to continue allocations in a first-fit fashion.
- * Once the space map's free space drops below this level we dynamically
- * switch to using best-fit allocations.
- */
-extern uint_t metaslab_df_free_pct;
-
-SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct,
-	CTLFLAG_RWTUN, &metaslab_df_free_pct, 0,
-	"The minimum free space, in percent, which must be available in a"
-	" space map to continue allocations in a first-fit fashion");
-
 /* mmp.c */
 
 int
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 959aa1b8384..2be1f281268 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -6443,6 +6443,14 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
 	"When looking in size tree, use largest segment instead of exact fit");
 
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_alloc_threshold, U64, ZMOD_RW,
+	"Minimum size which forces the dynamic allocator to change its "
+	"allocation strategy");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_free_pct, UINT, ZMOD_RW,
+	"The minimum free space, in percent, to continue allocations in a "
+	"first-fit fashion");
+
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
 	ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");