zap: lift and simplify zap_t lock upgrade

Most fatzap write ops only take the READER zap_t lock, because the
header block only needs to be updated when a change would add or remove
a leaf block or spill the ptrtbl. When this happens, the lock is
upgraded to WRITER so those changes can be made.

If the lock can't be upgraded directly (not least because
rw_tryupgrade() is a no-op on Linux and userspace), then it has to be
dropped and re-acquired, that is, zap_unlock() and then zap_lock().

However, this method is far heavier than it needs to be, and adds
complication because it fully releases the zap_t, the header dbuf and
the dnode. This gives a window where the dbuf can be evicted and so the
zap_t destroyed. In addition to the IO overhead if this happens, this
means the zap_t returned by zap_lock() may be different to the original,
which means all callers need to be prepared for it to change.

zap_shrink() used an alternate method of simply dropping and reacquiring
zap_rwlock rather than fully destroying everything. The comment there
says it was only done because of lack of a refcount tag for unlock/lock,
but this is actually a better general technique, as the zap_t is
guaranteed to remain alive because its owning dbuf is never released and
so can will not be evicted.

So, this commit lifts the old zap_tryupgradedir() to
zap_lock_try_upgrade(), and adds a potentially-blocking variant
zap_lock_upgrade() that drops and retakes the rwlock. Everything is
switched to use them, which vastly simplifies the surrounding code.
Because the zap_t, dbuf and dnode are never dropped, there's no way for
the upgrade operation to fail, and so the callers never have to deal
with the zap_t changing under them.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18546
This commit is contained in:
Rob Norris
2026-05-14 21:36:07 +10:00
committed by Brian Behlendorf
parent 18d910bd2c
commit c8f9b4c4da
5 changed files with 62 additions and 61 deletions
+12
View File
@@ -258,6 +258,18 @@ int zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
/* Unlock and release a zap_t. */
void zap_unlock(zap_t *zap, const void *tag);
/*
* Try to upgrade a zap lock from READER to WRITER. If the upgrade is not
* possible without blocking, returns 0. If the upgrade happened, returns 1.
*/
int zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx);
/*
* Upgrade a zap lock from READER to WRITER. If it can't be upgraded
* immediately it will block.
*/
void zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx);
/* zap_t release function for when associated dbuf is evicted. */
void zap_evict_sync(void *dbu);
-10
View File
@@ -501,7 +501,6 @@ zap_add_impl(zap_t *zap, const char *key,
}
if (!zap->zap_ismicro) {
err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(key) >= MZAP_NAME_LEN ||
!mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
@@ -510,7 +509,6 @@ zap_add_impl(zap_t *zap, const char *key,
err = fzap_add(zn, integer_size, num_integers, val,
tag, tx);
}
zap = zn->zn_zap; /* fzap_add() may change zap */
} else {
zfs_btree_index_t idx;
if (mze_find(zn, &idx) != NULL) {
@@ -521,7 +519,6 @@ zap_add_impl(zap_t *zap, const char *key,
}
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlock(zap, tag);
return (err);
}
@@ -573,9 +570,7 @@ zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
return (SET_ERROR(ENOTSUP));
}
err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlock(zap, tag);
return (err);
}
@@ -635,7 +630,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
if (!zap->zap_ismicro) {
err = fzap_update(zn, integer_size, num_integers, val,
FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(name) >= MZAP_NAME_LEN) {
dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
@@ -646,7 +640,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
err = fzap_update(zn, integer_size, num_integers,
val, FTAG, tx);
}
zap = zn->zn_zap; /* fzap_update() may change zap */
} else {
zfs_btree_index_t idx;
mzap_ent_t *mze = mze_find(zn, &idx);
@@ -658,7 +651,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
}
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlock(zap, FTAG);
return (err);
}
@@ -678,9 +670,7 @@ zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
return (SET_ERROR(ENOTSUP));
}
err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlock(zap, tag);
return (err);
}
+9 -46
View File
@@ -25,6 +25,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
* Copyright (c) 2023, Klara Inc.
* Copyright (c) 2026, TrueNAS.
*/
/*
@@ -157,18 +158,6 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
dmu_buf_rele(db, FTAG);
}
static int
zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
{
if (RW_WRITE_HELD(&zap->zap_rwlock))
return (1);
if (rw_tryupgrade(&zap->zap_rwlock)) {
dmu_buf_will_dirty(zap->zap_dbuf, tx);
return (1);
}
return (0);
}
/*
* Generic routines for dealing with the pointer & cookie tables.
*/
@@ -711,6 +700,7 @@ static int
zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
{
(void) tag;
zap_t *zap = zn->zn_zap;
uint64_t hash = zn->zn_hash;
int err;
@@ -722,21 +712,13 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
zap_leaf_phys(l)->l_hdr.lh_prefix);
if (zap_tryupgradedir(zap, tx) == 0 ||
if (zap_lock_try_upgrade(zap, tx) == 0 ||
old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
/* We failed to upgrade, or need to grow the pointer table */
objset_t *os = zap->zap_objset;
uint64_t object = zap->zap_object;
zap_put_leaf(l);
*lp = l = NULL;
zap_unlock(zap, tag);
err = zap_lock(os, object, tx, RW_WRITER,
FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
if (err != 0)
return (err);
ASSERT(!zap->zap_ismicro);
zap_lock_upgrade(zap, tx);
while (old_prefix_len ==
zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
@@ -801,6 +783,7 @@ static void
zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
const void *tag, dmu_tx_t *tx)
{
(void) tag;
zap_t *zap = zn->zn_zap;
int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
@@ -813,17 +796,7 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
* We are in the middle of growing the pointer table, or
* this leaf will soon make us grow it.
*/
if (zap_tryupgradedir(zap, tx) == 0) {
objset_t *os = zap->zap_objset;
uint64_t zapobj = zap->zap_object;
zap_unlock(zap, tag);
int err = zap_lock(os, zapobj, tx,
RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
if (err != 0)
return;
}
zap_lock_upgrade(zap, tx);
/* could have finished growing while our locks were down */
if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
@@ -946,7 +919,6 @@ fzap_add_cd(zap_name_t *zn,
zap_increment_num_entries(zap, 1, tx);
} else if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
}
@@ -1009,7 +981,6 @@ fzap_update(zap_name_t *zn,
if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
}
@@ -1392,22 +1363,14 @@ zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
* If there two empty sibling, we have work to do, so
* we need to lock ZAP ptrtbl as WRITER.
*/
if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
if (!writer && (writer = zap_lock_try_upgrade(zap, tx)) == 0) {
/* We failed to upgrade */
if (l != NULL) {
zap_put_leaf(l);
l = NULL;
}
/*
* Usually, the right way to upgrade from a READER lock
* to a WRITER lock is to call zap_unlock() and
* zap_lock(), but we do not have a tag. Instead,
* we do it in more sophisticated way.
*/
rw_exit(&zap->zap_rwlock);
rw_enter(&zap->zap_rwlock, RW_WRITER);
dmu_buf_will_dirty(zap->zap_dbuf, tx);
zap_lock_upgrade(zap, tx);
zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
writer = B_TRUE;
+37
View File
@@ -431,6 +431,43 @@ zap_unlock(zap_t *zap, const void *tag)
dmu_buf_rele(zap->zap_dbuf, tag);
}
int
zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx)
{
if (RW_WRITE_HELD(&zap->zap_rwlock))
/* Already have writer, nothing to do. */
return (1);
/* Try to upgrade the lock in-place. */
if (rw_tryupgrade(&zap->zap_rwlock)) {
/*
* Got it, mark buffer dirty, since we only do that in
* zap_lock_impl() for writer.
*/
dmu_buf_will_dirty(zap->zap_dbuf, tx);
return (1);
}
return (0);
}
void
zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx)
{
if (zap_lock_try_upgrade(zap, tx))
return;
/*
* It's safe to drop the lock here because we still have a hold on
* zap_dbuf, which prevents the dbuf being evicted and the zap_t being
* deallocated.
*/
rw_exit(&zap->zap_rwlock);
rw_enter(&zap->zap_rwlock, RW_WRITER);
dmu_buf_will_dirty(zap->zap_dbuf, tx);
}
void
zap_evict_sync(void *dbu)
{
-1
View File
@@ -363,7 +363,6 @@ mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
/* If we fail here, we would end up losing entries */
VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
tag, tx));
zap = zn->zn_zap; /* fzap_add_cd() may change zap */
}
zap_name_free(zn);
vmem_free(mzp, sz);