diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index d985a5a0294..b4f30405eba 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -258,6 +258,18 @@ int zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx, /* Unlock and release a zap_t. */ void zap_unlock(zap_t *zap, const void *tag); +/* + * Try to upgrade a zap lock from READER to WRITER. If the upgrade is not + * possible without blocking, returns 0. If the upgrade happened, returns 1. + */ +int zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx); + +/* + * Upgrade a zap lock from READER to WRITER. If it can't be upgraded + * immediately it will block. + */ +void zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx); + /* zap_t release function for when associated dbuf is evicted. */ void zap_evict_sync(void *dbu); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index f319469e55e..8e3cf7e1c47 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -501,7 +501,6 @@ zap_add_impl(zap_t *zap, const char *key, } if (!zap->zap_ismicro) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN || !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { @@ -510,7 +509,6 @@ zap_add_impl(zap_t *zap, const char *key, err = fzap_add(zn, integer_size, num_integers, val, tag, tx); } - zap = zn->zn_zap; /* fzap_add() may change zap */ } else { zfs_btree_index_t idx; if (mze_find(zn, &idx) != NULL) { @@ -521,8 +519,7 @@ zap_add_impl(zap_t *zap, const char *key, } ASSERT(zap == zn->zn_zap); zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlock(zap, tag); + zap_unlock(zap, tag); return (err); } @@ -573,10 +570,8 @@ zap_add_uint64_impl(zap_t *zap, const uint64_t *key, return (SET_ERROR(ENOTSUP)); } err = fzap_add(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlock(zap, tag); + zap_unlock(zap, tag); return (err); } @@ -635,7 +630,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, if (!zap->zap_ismicro) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", @@ -646,7 +640,6 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); } - zap = zn->zn_zap; /* fzap_update() may change zap */ } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); @@ -658,8 +651,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } ASSERT(zap == zn->zn_zap); zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlock(zap, FTAG); + zap_unlock(zap, FTAG); return (err); } @@ -678,10 +670,8 @@ zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, return (SET_ERROR(ENOTSUP)); } err = fzap_update(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlock(zap, tag); + zap_unlock(zap, tag); return (err); } diff --git a/module/zfs/zap_fat.c b/module/zfs/zap_fat.c index 2d068f64671..6cf773e786e 100644 --- a/module/zfs/zap_fat.c +++ b/module/zfs/zap_fat.c @@ -25,6 +25,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2023 Alexander Stetsenko * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2026, TrueNAS. */ /* @@ -157,18 +158,6 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) dmu_buf_rele(db, FTAG); } -static int -zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) -{ - if (RW_WRITE_HELD(&zap->zap_rwlock)) - return (1); - if (rw_tryupgrade(&zap->zap_rwlock)) { - dmu_buf_will_dirty(zap->zap_dbuf, tx); - return (1); - } - return (0); -} - /* * Generic routines for dealing with the pointer & cookie tables. */ @@ -711,6 +700,7 @@ static int zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, const void *tag, dmu_tx_t *tx, zap_leaf_t **lp) { + (void) tag; zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; int err; @@ -722,21 +712,13 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); - if (zap_tryupgradedir(zap, tx) == 0 || + if (zap_lock_try_upgrade(zap, tx) == 0 || old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { /* We failed to upgrade, or need to grow the pointer table */ - objset_t *os = zap->zap_objset; - uint64_t object = zap->zap_object; - zap_put_leaf(l); *lp = l = NULL; - zap_unlock(zap, tag); - err = zap_lock(os, object, tx, RW_WRITER, - FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return (err); - ASSERT(!zap->zap_ismicro); + + zap_lock_upgrade(zap, tx); while (old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { @@ -801,6 +783,7 @@ static void zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, const void *tag, dmu_tx_t *tx) { + (void) tag; zap_t *zap = zn->zn_zap; int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && @@ -813,17 +796,7 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, * We are in the middle of growing the pointer table, or * this leaf will soon make us grow it. */ - if (zap_tryupgradedir(zap, tx) == 0) { - objset_t *os = zap->zap_objset; - uint64_t zapobj = zap->zap_object; - - zap_unlock(zap, tag); - int err = zap_lock(os, zapobj, tx, - RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return; - } + zap_lock_upgrade(zap, tx); /* could have finished growing while our locks were down */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) @@ -946,7 +919,6 @@ fzap_add_cd(zap_name_t *zn, zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } @@ -1009,7 +981,6 @@ fzap_update(zap_name_t *zn, if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } @@ -1392,22 +1363,14 @@ zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) * If there two empty sibling, we have work to do, so * we need to lock ZAP ptrtbl as WRITER. */ - if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { + if (!writer && (writer = zap_lock_try_upgrade(zap, tx)) == 0) { /* We failed to upgrade */ if (l != NULL) { zap_put_leaf(l); l = NULL; } - /* - * Usually, the right way to upgrade from a READER lock - * to a WRITER lock is to call zap_unlock() and - * zap_lock(), but we do not have a tag. Instead, - * we do it in more sophisticated way. - */ - rw_exit(&zap->zap_rwlock); - rw_enter(&zap->zap_rwlock, RW_WRITER); - dmu_buf_will_dirty(zap->zap_dbuf, tx); + zap_lock_upgrade(zap, tx); zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; writer = B_TRUE; diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c index c05985c0adb..b72aa82349e 100644 --- a/module/zfs/zap_impl.c +++ b/module/zfs/zap_impl.c @@ -431,6 +431,43 @@ zap_unlock(zap_t *zap, const void *tag) dmu_buf_rele(zap->zap_dbuf, tag); } +int +zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + if (RW_WRITE_HELD(&zap->zap_rwlock)) + /* Already have writer, nothing to do. */ + return (1); + + /* Try to upgrade the lock in-place. */ + if (rw_tryupgrade(&zap->zap_rwlock)) { + /* + * Got it, mark buffer dirty, since we only do that in + * zap_lock_impl() for writer. + */ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + return (1); + } + + return (0); +} + +void +zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + if (zap_lock_try_upgrade(zap, tx)) + return; + + /* + * It's safe to drop the lock here because we still have a hold on + * zap_dbuf, which prevents the dbuf being evicted and the zap_t being + * deallocated. + */ + rw_exit(&zap->zap_rwlock); + + rw_enter(&zap->zap_rwlock, RW_WRITER); + dmu_buf_will_dirty(zap->zap_dbuf, tx); +} + void zap_evict_sync(void *dbu) { diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 04956b005c8..727f72d999f 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -363,7 +363,6 @@ mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx)); - zap = zn->zn_zap; /* fzap_add_cd() may change zap */ } zap_name_free(zn); vmem_free(mzp, sz);