From efda1093ffa89a7b165a776c3d30ea44a13ee361 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 10 May 2026 14:02:35 +1000 Subject: [PATCH] zap: add zap_cursor_init_by_dnode() & rework cursor resource lifetime This commit adds zap_cursor_init_by_dnode() (and zap_cursor_init_serialized_by_dnode()), which allow the target ZAP to provided via an existing dnode rather than the traditional objset+object pair. This requires some reorganisation of the way that zap_cursor_t is initialised. Up until now, zap_cursor_init() has merely stored the objset, object, serialized form and prefetch flag, and left it until zap_cursor_retrieve() to actually call zap_lock(). This makes a _by_dnode() form complicated, because it is a held resource that needs to be released, but might not be used if zap_cursor_retrieve() is not called. So there's a bunch of state tracking required. However, all cursor users immediately follow zap_cursor_init() with zap_cursor_retrieve(), so there's nothing gained by delaying holds. This allows us to simplify things, by calling zap_lock() directly in zap_cursor_init() and retaining it until zap_cursor_fini(). This does however means the _init() functions are now fallible, and can return an error. This adds complexity to most of the call sites, which are typically in a for loop of the form: for (zap_cursor_init(...); zap_cursor_retrieve(...) == 0; zap_cursor_advance(...)) To avoid needing to make significant changes at every call site, a failed _init() call will also zero the cursor struct. If the caller doesn't check the return and continues to zap_cursor_retrieve(), they will get an EIO return, and zap_cursor_fini() will just return. The existing zc_objset and zc_zapobj fields are retained to support source backcompat for Lustre, which inspects them directly. Sponsored-by: TrueNAS Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #18603 --- include/sys/zap.h | 19 +++++--- module/zfs/zap.c | 118 +++++++++++++++++++++++++++++----------------- 2 files changed, 88 insertions(+), 49 deletions(-) diff --git a/include/sys/zap.h b/include/sys/zap.h index 7e89ad7d3de..ad20d427ad9 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -443,16 +443,20 @@ void zap_attribute_free(zap_attribute_t *attrp); struct zap; struct zap_leaf; + typedef struct zap_cursor { /* This structure is opaque! */ - objset_t *zc_objset; struct zap *zc_zap; struct zap_leaf *zc_leaf; - uint64_t zc_zapobj; - uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; boolean_t zc_prefetch; + /* + * Legacy fields to main source compat with Lustre, which accesses + * them directly. Not to be used in new code! + */ + objset_t *zc_objset; + uint64_t zc_zapobj; } zap_cursor_t; /* @@ -460,14 +464,15 @@ typedef struct zap_cursor { * The entire zapobj will be prefetched. You must call zap_cursor_fini the * cursor when you are done with it. */ -void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj); +int zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj); +int zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn); void zap_cursor_fini(zap_cursor_t *zc); /* * Initialize a cursor at the beginning, but request that we not prefetch * the entire ZAP object. */ -void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, +int zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj); /* @@ -477,8 +482,10 @@ void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to * zap_cursor_init(...).) */ -void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, +int zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized); +int zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn, + uint64_t serialized); /* * Get the attribute currently pointed to by the cursor. Returns diff --git a/module/zfs/zap.c b/module/zfs/zap.c index caed9c67794..ee94917d8e8 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -1072,53 +1072,100 @@ zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) /* zap_cursor */ -static void -zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, +static int +zap_cursor_init_by_dnode_impl(zap_cursor_t *zc, dnode_t *dn, uint64_t serialized, boolean_t prefetch) { - zc->zc_objset = os; zc->zc_zap = NULL; zc->zc_leaf = NULL; - zc->zc_zapobj = zapobj; - zc->zc_serialized = serialized; - zc->zc_hash = 0; - zc->zc_cd = 0; + + int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + zc, &zc->zc_zap); + if (err != 0) + return (err); + zc->zc_prefetch = prefetch; + zc->zc_objset = dn->dn_objset; + zc->zc_zapobj = dn->dn_object; + + int hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = serialized << (64 - hb); + zc->zc_cd = serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; + + /* + * Drop ZAP read lock, but keep the hold, so the holds on the + * underlying dnode and header dbuf are maintained. + */ + rw_exit(&zc->zc_zap->zap_rwlock); + + return (0); } -void +static int +zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized, uint32_t prefetch) +{ + dnode_t *dn = NULL; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) { + zc->zc_zap = NULL; + zc->zc_leaf = NULL; + return (err); + } + + err = zap_cursor_init_by_dnode_impl(zc, dn, serialized, prefetch); + + dnode_rele(dn, FTAG); + + return (err); +} + +int zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { - zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); + return (zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE)); } -void +int +zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn) +{ + return (zap_cursor_init_by_dnode_impl(zc, dn, 0, B_TRUE)); +} + +int zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { - zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); + return (zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE)); } -void +int zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) { - zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); + return (zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE)); +} + +int +zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn, + uint64_t serialized) +{ + return (zap_cursor_init_by_dnode_impl(zc, dn, serialized, B_TRUE)); } void zap_cursor_fini(zap_cursor_t *zc) { - if (zc->zc_zap) { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - zap_unlock(zc->zc_zap, NULL); - zc->zc_zap = NULL; - } if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; } - zc->zc_objset = NULL; + if (zc->zc_zap) { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + zap_unlock(zc->zc_zap, zc); + } + memset(zc, 0, sizeof (zap_cursor_t)); } int @@ -1126,30 +1173,15 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; + if (zc->zc_zap == NULL) + /* zap_cursor_init failed, cursor is invalid */ + return (SET_ERROR(EIO)); + if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); - if (zc->zc_zap == NULL) { - int hb; - err = zap_lock(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); - if (err != 0) - return (err); + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - /* - * To support zap_cursor_init_serialized, advance, retrieve, - * we must add to the existing zc_cd, which may already - * be 1 due to the zap_cursor_advance. - */ - ASSERT0(zc->zc_hash); - hb = zap_hashbits(zc->zc_zap); - zc->zc_hash = zc->zc_serialized << (64 - hb); - zc->zc_cd += zc->zc_serialized >> hb; - if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ - zc->zc_cd = 0; - } else { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { @@ -1184,6 +1216,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) err = SET_ERROR(ENOENT); } } + rw_exit(&zc->zc_zap->zap_rwlock); return (err); } @@ -1199,10 +1232,9 @@ zap_cursor_advance(zap_cursor_t *zc) uint64_t zap_cursor_serialize(zap_cursor_t *zc) { - if (zc->zc_hash == -1ULL) + if (zc->zc_zap == NULL || zc->zc_hash == -1ULL) return (-1ULL); - if (zc->zc_zap == NULL) - return (zc->zc_serialized); + ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap))); ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));