From c8f9b4c4da44e7ef19be91510f36b8f7acdab4aa Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@truenas.com>
Date: Thu, 14 May 2026 21:36:07 +1000
Subject: [PATCH] zap: lift and simplify zap_t lock upgrade

Most fatzap write ops only take the READER zap_t lock, because the
header block only needs to be updated when a change would add or remove
a leaf block or spill the ptrtbl. When this happens, the lock is
upgraded to WRITER so those changes can be made.

If the lock can't be upgraded directly (not least because
rw_tryupgrade() is a no-op on Linux and userspace), then it has to be
dropped and re-acquired, that is, zap_unlock() and then zap_lock().

However, this method is far heavier than it needs to be, and adds
complication because it fully releases the zap_t, the header dbuf and
the dnode. This gives a window where the dbuf can be evicted and so the
zap_t destroyed. In addition to the IO overhead if this happens, this
means the zap_t returned by zap_lock() may be different to the original,
which means all callers need to be prepared for it to change.

zap_shrink() used an alternate method of simply dropping and reacquiring
zap_rwlock rather than fully destroying everything. The comment there
says it was only done because of lack of a refcount tag for unlock/lock,
but this is actually a better general technique, as the zap_t is
guaranteed to remain alive because its owning dbuf is never released and
so can will not be evicted.

So, this commit lifts the old zap_tryupgradedir() to
zap_lock_try_upgrade(), and adds a potentially-blocking variant
zap_lock_upgrade() that drops and retakes the rwlock. Everything is
switched to use them, which vastly simplifies the surrounding code.
Because the zap_t, dbuf and dnode are never dropped, there's no way for
the upgrade operation to fail, and so the callers never have to deal
with the zap_t changing under them.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18546
---
 include/sys/zap_impl.h | 12 +++++++++
 module/zfs/zap.c       | 18 +++-----------
 module/zfs/zap_fat.c   | 55 +++++++-----------------------------------
 module/zfs/zap_impl.c  | 37 ++++++++++++++++++++++++++++
 module/zfs/zap_micro.c |  1 -
 5 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index d985a5a0294..b4f30405eba 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -258,6 +258,18 @@ int zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
 /* Unlock and release a zap_t. */
 void zap_unlock(zap_t *zap, const void *tag);
 
+/*
+ * Try to upgrade a zap lock from READER to WRITER. If the upgrade is not
+ * possible without blocking, returns 0. If the upgrade happened, returns 1.
+ */
+int zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+/*
+ * Upgrade a zap lock from READER to WRITER. If it can't be upgraded
+ * immediately it will block.
+ */
+void zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx);
+
 /* zap_t release function for when associated dbuf is evicted. */
 void zap_evict_sync(void *dbu);
 
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index f319469e55e..8e3cf7e1c47 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -501,7 +501,6 @@ zap_add_impl(zap_t *zap, const char *key,
 	}
 	if (!zap->zap_ismicro) {
 		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
-		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(key) >= MZAP_NAME_LEN ||
 	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
@@ -510,7 +509,6 @@ zap_add_impl(zap_t *zap, const char *key,
 			err = fzap_add(zn, integer_size, num_integers, val,
 			    tag, tx);
 		}
-		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else {
 		zfs_btree_index_t idx;
 		if (mze_find(zn, &idx) != NULL) {
@@ -521,8 +519,7 @@ zap_add_impl(zap_t *zap, const char *key,
 	}
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlock(zap, tag);
+	zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -573,10 +570,8 @@ zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
-	zap = zn->zn_zap;	/* fzap_add() may change zap */
 	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlock(zap, tag);
+	zap_unlock(zap, tag);
 	return (err);
 }
 
@@ -635,7 +630,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	if (!zap->zap_ismicro) {
 		err = fzap_update(zn, integer_size, num_integers, val,
 		    FTAG, tx);
-		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(name) >= MZAP_NAME_LEN) {
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
@@ -646,7 +640,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 			err = fzap_update(zn, integer_size, num_integers,
 			    val, FTAG, tx);
 		}
-		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else {
 		zfs_btree_index_t idx;
 		mzap_ent_t *mze = mze_find(zn, &idx);
@@ -658,8 +651,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	}
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlock(zap, FTAG);
+	zap_unlock(zap, FTAG);
 	return (err);
 }
 
@@ -678,10 +670,8 @@ zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
-	zap = zn->zn_zap;	/* fzap_update() may change zap */
 	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlock(zap, tag);
+	zap_unlock(zap, tag);
 	return (err);
 }
 
diff --git a/module/zfs/zap_fat.c b/module/zfs/zap_fat.c
index 2d068f64671..6cf773e786e 100644
--- a/module/zfs/zap_fat.c
+++ b/module/zfs/zap_fat.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
  * Copyright (c) 2023, Klara Inc.
+ * Copyright (c) 2026, TrueNAS.
  */
 
 /*
@@ -157,18 +158,6 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
 	dmu_buf_rele(db, FTAG);
 }
 
-static int
-zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
-{
-	if (RW_WRITE_HELD(&zap->zap_rwlock))
-		return (1);
-	if (rw_tryupgrade(&zap->zap_rwlock)) {
-		dmu_buf_will_dirty(zap->zap_dbuf, tx);
-		return (1);
-	}
-	return (0);
-}
-
 /*
  * Generic routines for dealing with the pointer & cookie tables.
  */
@@ -711,6 +700,7 @@ static int
 zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
     const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
 {
+	(void) tag;
 	zap_t *zap = zn->zn_zap;
 	uint64_t hash = zn->zn_hash;
 	int err;
@@ -722,21 +712,13 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
-	if (zap_tryupgradedir(zap, tx) == 0 ||
+	if (zap_lock_try_upgrade(zap, tx) == 0 ||
 	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 		/* We failed to upgrade, or need to grow the pointer table */
-		objset_t *os = zap->zap_objset;
-		uint64_t object = zap->zap_object;
-
 		zap_put_leaf(l);
 		*lp = l = NULL;
-		zap_unlock(zap, tag);
-		err = zap_lock(os, object, tx, RW_WRITER,
-		    FALSE, FALSE, tag, &zn->zn_zap);
-		zap = zn->zn_zap;
-		if (err != 0)
-			return (err);
-		ASSERT(!zap->zap_ismicro);
+
+		zap_lock_upgrade(zap, tx);
 
 		while (old_prefix_len ==
 		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
@@ -801,6 +783,7 @@ static void
 zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
     const void *tag, dmu_tx_t *tx)
 {
+	(void) tag;
 	zap_t *zap = zn->zn_zap;
 	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
@@ -813,17 +796,7 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
 		 * We are in the middle of growing the pointer table, or
 		 * this leaf will soon make us grow it.
 		 */
-		if (zap_tryupgradedir(zap, tx) == 0) {
-			objset_t *os = zap->zap_objset;
-			uint64_t zapobj = zap->zap_object;
-
-			zap_unlock(zap, tag);
-			int err = zap_lock(os, zapobj, tx,
-			    RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
-			zap = zn->zn_zap;
-			if (err != 0)
-				return;
-		}
+		zap_lock_upgrade(zap, tx);
 
 		/* could have finished growing while our locks were down */
 		if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
@@ -946,7 +919,6 @@ fzap_add_cd(zap_name_t *zn,
 		zap_increment_num_entries(zap, 1, tx);
 	} else if (err == EAGAIN) {
 		err = zap_expand_leaf(zn, l, tag, tx, &l);
-		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
@@ -1009,7 +981,6 @@ fzap_update(zap_name_t *zn,
 
 	if (err == EAGAIN) {
 		err = zap_expand_leaf(zn, l, tag, tx, &l);
-		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
@@ -1392,22 +1363,14 @@ zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 		 * If there two empty sibling, we have work to do, so
 		 * we need to lock ZAP ptrtbl as WRITER.
 		 */
-		if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
+		if (!writer && (writer = zap_lock_try_upgrade(zap, tx)) == 0) {
 			/* We failed to upgrade */
 			if (l != NULL) {
 				zap_put_leaf(l);
 				l = NULL;
 			}
 
-			/*
-			 * Usually, the right way to upgrade from a READER lock
-			 * to a WRITER lock is to call zap_unlock() and
-			 * zap_lock(), but we do not have a tag. Instead,
-			 * we do it in more sophisticated way.
-			 */
-			rw_exit(&zap->zap_rwlock);
-			rw_enter(&zap->zap_rwlock, RW_WRITER);
-			dmu_buf_will_dirty(zap->zap_dbuf, tx);
+			zap_lock_upgrade(zap, tx);
 
 			zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 			writer = B_TRUE;
diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c
index c05985c0adb..b72aa82349e 100644
--- a/module/zfs/zap_impl.c
+++ b/module/zfs/zap_impl.c
@@ -431,6 +431,43 @@ zap_unlock(zap_t *zap, const void *tag)
 	dmu_buf_rele(zap->zap_dbuf, tag);
 }
 
+int
+zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+	if (RW_WRITE_HELD(&zap->zap_rwlock))
+		/* Already have writer, nothing to do. */
+		return (1);
+
+	/* Try to upgrade the lock in-place. */
+	if (rw_tryupgrade(&zap->zap_rwlock)) {
+		/*
+		 * Got it, mark buffer dirty, since we only do that in
+		 * zap_lock_impl() for writer.
+		 */
+		dmu_buf_will_dirty(zap->zap_dbuf, tx);
+		return (1);
+	}
+
+	return (0);
+}
+
+void
+zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+	if (zap_lock_try_upgrade(zap, tx))
+		return;
+
+	/*
+	 * It's safe to drop the lock here because we still have a hold on
+	 * zap_dbuf, which prevents the dbuf being evicted and the zap_t being
+	 * deallocated.
+	 */
+	rw_exit(&zap->zap_rwlock);
+
+	rw_enter(&zap->zap_rwlock, RW_WRITER);
+	dmu_buf_will_dirty(zap->zap_dbuf, tx);
+}
+
 void
 zap_evict_sync(void *dbu)
 {
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 04956b005c8..727f72d999f 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -363,7 +363,6 @@ mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
 		/* If we fail here, we would end up losing entries */
 		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
 		    tag, tx));
-		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
 	}
 	zap_name_free(zn);
 	vmem_free(mzp, sz);