zap: add zap_cursor_init_by_dnode() & rework cursor resource lifetime

This commit adds zap_cursor_init_by_dnode() (and
zap_cursor_init_serialized_by_dnode()), which allow the target ZAP to
provided via an existing dnode rather than the traditional objset+object
pair.

This requires some reorganisation of the way that zap_cursor_t is
initialised. Up until now, zap_cursor_init() has merely stored the
objset, object, serialized form and prefetch flag, and left it until
zap_cursor_retrieve() to actually call zap_lock(). This makes a
_by_dnode() form complicated, because it is a held resource that needs
to be released, but might not be used if zap_cursor_retrieve() is not
called. So there's a bunch of state tracking required.

However, all cursor users immediately follow zap_cursor_init() with
zap_cursor_retrieve(), so there's nothing gained by delaying holds. This
allows us to simplify things, by calling zap_lock() directly in
zap_cursor_init() and retaining it until zap_cursor_fini().

This does however means the _init() functions are now fallible, and can
return an error. This adds complexity to most of the call sites, which
are typically in a for loop of the form:

    for (zap_cursor_init(...);
      zap_cursor_retrieve(...) == 0;
      zap_cursor_advance(...))

To avoid needing to make significant changes at every call site, a
failed _init() call will also zero the cursor struct. If the caller
doesn't check the return and continues to zap_cursor_retrieve(), they
will get an EIO return, and zap_cursor_fini() will just return.

The existing zc_objset and zc_zapobj fields are retained to support
source backcompat for Lustre, which inspects them directly.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18603
This commit is contained in:
Rob Norris
2026-05-10 14:02:35 +10:00
committed by Brian Behlendorf
parent 68980eb105
commit efda1093ff
2 changed files with 88 additions and 49 deletions
+13 -6
View File
@@ -443,16 +443,20 @@ void zap_attribute_free(zap_attribute_t *attrp);
struct zap;
struct zap_leaf;
typedef struct zap_cursor {
/* This structure is opaque! */
objset_t *zc_objset;
struct zap *zc_zap;
struct zap_leaf *zc_leaf;
uint64_t zc_zapobj;
uint64_t zc_serialized;
uint64_t zc_hash;
uint32_t zc_cd;
boolean_t zc_prefetch;
/*
* Legacy fields to main source compat with Lustre, which accesses
* them directly. Not to be used in new code!
*/
objset_t *zc_objset;
uint64_t zc_zapobj;
} zap_cursor_t;
/*
@@ -460,14 +464,15 @@ typedef struct zap_cursor {
* The entire zapobj will be prefetched. You must call zap_cursor_fini the
* cursor when you are done with it.
*/
void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
int zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
int zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn);
void zap_cursor_fini(zap_cursor_t *zc);
/*
* Initialize a cursor at the beginning, but request that we not prefetch
* the entire ZAP object.
*/
void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
int zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
uint64_t zapobj);
/*
@@ -477,8 +482,10 @@ void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
* zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
* zap_cursor_init(...).)
*/
void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os,
int zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os,
uint64_t zapobj, uint64_t serialized);
int zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn,
uint64_t serialized);
/*
* Get the attribute currently pointed to by the cursor. Returns
+75 -43
View File
@@ -1072,53 +1072,100 @@ zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
/* zap_cursor */
static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
static int
zap_cursor_init_by_dnode_impl(zap_cursor_t *zc, dnode_t *dn,
uint64_t serialized, boolean_t prefetch)
{
zc->zc_objset = os;
zc->zc_zap = NULL;
zc->zc_leaf = NULL;
zc->zc_zapobj = zapobj;
zc->zc_serialized = serialized;
zc->zc_hash = 0;
zc->zc_cd = 0;
int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
zc, &zc->zc_zap);
if (err != 0)
return (err);
zc->zc_prefetch = prefetch;
zc->zc_objset = dn->dn_objset;
zc->zc_zapobj = dn->dn_object;
int hb = zap_hashbits(zc->zc_zap);
zc->zc_hash = serialized << (64 - hb);
zc->zc_cd = serialized >> hb;
if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
zc->zc_cd = 0;
/*
* Drop ZAP read lock, but keep the hold, so the holds on the
* underlying dnode and header dbuf are maintained.
*/
rw_exit(&zc->zc_zap->zap_rwlock);
return (0);
}
void
static int
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized, uint32_t prefetch)
{
dnode_t *dn = NULL;
int err = dnode_hold(os, zapobj, FTAG, &dn);
if (err != 0) {
zc->zc_zap = NULL;
zc->zc_leaf = NULL;
return (err);
}
err = zap_cursor_init_by_dnode_impl(zc, dn, serialized, prefetch);
dnode_rele(dn, FTAG);
return (err);
}
int
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
return (zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE));
}
void
int
zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn)
{
return (zap_cursor_init_by_dnode_impl(zc, dn, 0, B_TRUE));
}
int
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
return (zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE));
}
void
int
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized)
{
zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
return (zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE));
}
int
zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn,
uint64_t serialized)
{
return (zap_cursor_init_by_dnode_impl(zc, dn, serialized, B_TRUE));
}
void
zap_cursor_fini(zap_cursor_t *zc)
{
if (zc->zc_zap) {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
zap_unlock(zc->zc_zap, NULL);
zc->zc_zap = NULL;
}
if (zc->zc_leaf) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
zap_put_leaf(zc->zc_leaf);
zc->zc_leaf = NULL;
}
zc->zc_objset = NULL;
if (zc->zc_zap) {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
zap_unlock(zc->zc_zap, zc);
}
memset(zc, 0, sizeof (zap_cursor_t));
}
int
@@ -1126,30 +1173,15 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
int err;
if (zc->zc_zap == NULL)
/* zap_cursor_init failed, cursor is invalid */
return (SET_ERROR(EIO));
if (zc->zc_hash == -1ULL)
return (SET_ERROR(ENOENT));
if (zc->zc_zap == NULL) {
int hb;
err = zap_lock(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
if (err != 0)
return (err);
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
/*
* To support zap_cursor_init_serialized, advance, retrieve,
* we must add to the existing zc_cd, which may already
* be 1 due to the zap_cursor_advance.
*/
ASSERT0(zc->zc_hash);
hb = zap_hashbits(zc->zc_zap);
zc->zc_hash = zc->zc_serialized << (64 - hb);
zc->zc_cd += zc->zc_serialized >> hb;
if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
zc->zc_cd = 0;
} else {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
}
if (!zc->zc_zap->zap_ismicro) {
err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
} else {
@@ -1184,6 +1216,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
err = SET_ERROR(ENOENT);
}
}
rw_exit(&zc->zc_zap->zap_rwlock);
return (err);
}
@@ -1199,10 +1232,9 @@ zap_cursor_advance(zap_cursor_t *zc)
uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
if (zc->zc_hash == -1ULL)
if (zc->zc_zap == NULL || zc->zc_hash == -1ULL)
return (-1ULL);
if (zc->zc_zap == NULL)
return (zc->zc_serialized);
ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap)));
ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));