zap: add zap_cursor_init_by_dnode() & rework cursor resource lifetime

This commit adds zap_cursor_init_by_dnode() (and
zap_cursor_init_serialized_by_dnode()), which allow the target ZAP to
be provided via an existing dnode rather than the traditional
objset+object pair.

This requires some reorganisation of the way that zap_cursor_t is
initialised. Up until now, zap_cursor_init() has merely stored the
objset, object, serialized form and prefetch flag, and left it until
zap_cursor_retrieve() to actually call zap_lock(). This makes a
_by_dnode() form complicated, because it is a held resource that needs
to be released, but might not be used if zap_cursor_retrieve() is not
called. So there's a bunch of state tracking required.

However, all cursor users immediately follow zap_cursor_init() with
zap_cursor_retrieve(), so there's nothing gained by delaying holds. This
allows us to simplify things, by calling zap_lock() directly in
zap_cursor_init() and retaining it until zap_cursor_fini().

This does, however, mean the _init() functions are now fallible and can
return an error. This adds complexity to most of the call sites, which
are typically in a for loop of the form:

    for (zap_cursor_init(...);
      zap_cursor_retrieve(...) == 0;
      zap_cursor_advance(...))

To avoid needing to make significant changes at every call site, a
failed _init() call will also zero the cursor struct. If the caller
doesn't check the return and continues to zap_cursor_retrieve(), they
will get an EIO return, and zap_cursor_fini() will just return.

The existing zc_objset and zc_zapobj fields are retained to support
source backcompat for Lustre, which inspects them directly.

Sponsored-by: TrueNAS
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@truenas.com>
Closes #18603

This commit is contained in:

Rob Norris

2026-05-10 14:02:35 +10:00

committed by

Brian Behlendorf

parent 68980eb105

commit efda1093ff

2 changed files with 88 additions and 49 deletions

									
										include/sys/zap.h
									
		+13
		-6
	
												View File
												
				@@ -443,16 +443,20 @@ void zap_attribute_free(zap_attribute_t *attrp);

				struct zap;

				struct zap_leaf;

				typedef struct zap_cursor {

					/* This structure is opaque! */

					objset_t *zc_objset;

					struct zap *zc_zap;

					struct zap_leaf *zc_leaf;

					uint64_t zc_zapobj;

					uint64_t zc_serialized;

					uint64_t zc_hash;

					uint32_t zc_cd;

					boolean_t zc_prefetch;

					/*

					 * Legacy fields to maintain source compat with Lustre, which accesses

					 * them directly. Not to be used in new code!

					 */

					objset_t *zc_objset;

					uint64_t zc_zapobj;

				} zap_cursor_t;

				/*

				@@ -460,14 +464,15 @@ typedef struct zap_cursor {

				 * The entire zapobj will be prefetched. You must call zap_cursor_fini on the

				 * cursor when you are done with it.

				 */

				void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);

				int zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);

				int zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn);

				void zap_cursor_fini(zap_cursor_t *zc);

				/*

				 * Initialize a cursor at the beginning, but request that we not prefetch

				 * the entire ZAP object.

				 */

				void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,

				int zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,

				    uint64_t zapobj);

				/*

				@@ -477,8 +482,10 @@ void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,

				 * zapobj (ie.  zap_cursor_init_serialized(..., 0) is equivalent to

				 * zap_cursor_init(...).)

				 */

				void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os,

				int zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os,

				    uint64_t zapobj, uint64_t serialized);

				int zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn,

				    uint64_t serialized);

				/*

				 * Get the attribute currently pointed to by the cursor.  Returns

									
										module/zfs/zap.c
									
		+75
		-43
	
												View File
												
				@@ -1072,53 +1072,100 @@ zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)

				/* zap_cursor */

				static void

				zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,

				static int

				zap_cursor_init_by_dnode_impl(zap_cursor_t *zc, dnode_t *dn,

				    uint64_t serialized, boolean_t prefetch)

				{

					zc->zc_objset = os;

					zc->zc_zap = NULL;

					zc->zc_leaf = NULL;

					zc->zc_zapobj = zapobj;

					zc->zc_serialized = serialized;

					zc->zc_hash = 0;

					zc->zc_cd = 0;

					int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,

					    zc, &zc->zc_zap);

					if (err != 0)

						return (err);

					zc->zc_prefetch = prefetch;

					zc->zc_objset = dn->dn_objset;

					zc->zc_zapobj = dn->dn_object;

					int hb = zap_hashbits(zc->zc_zap);

					zc->zc_hash = serialized << (64 - hb);

					zc->zc_cd = serialized >> hb;

					if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */

						zc->zc_cd = 0;

					/*

					 * Drop ZAP read lock, but keep the hold, so the holds on the

					 * underlying dnode and header dbuf are maintained.

					 */

					rw_exit(&zc->zc_zap->zap_rwlock);

					return (0);

				}

				void

				static int

				zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,

				    uint64_t serialized, uint32_t prefetch)

				{

					dnode_t *dn = NULL;

					int err = dnode_hold(os, zapobj, FTAG, &dn);

					if (err != 0) {

						zc->zc_zap = NULL;

						zc->zc_leaf = NULL;

						return (err);

					}

					err = zap_cursor_init_by_dnode_impl(zc, dn, serialized, prefetch);

					dnode_rele(dn, FTAG);

					return (err);

				}

				int

				zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)

				{

					zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);

					return (zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE));

				}

				void

				int

				zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn)

				{

					return (zap_cursor_init_by_dnode_impl(zc, dn, 0, B_TRUE));

				}

				int

				zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)

				{

					zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);

					return (zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE));

				}

				void

				int

				zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,

				    uint64_t serialized)

				{

					zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);

					return (zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE));

				}

				int

				zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn,

				    uint64_t serialized)

				{

					return (zap_cursor_init_by_dnode_impl(zc, dn, serialized, B_TRUE));

				}

				void

				zap_cursor_fini(zap_cursor_t *zc)

				{

					if (zc->zc_zap) {

						rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);

						zap_unlock(zc->zc_zap, NULL);

						zc->zc_zap = NULL;

					}

					if (zc->zc_leaf) {

						rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);

						zap_put_leaf(zc->zc_leaf);

						zc->zc_leaf = NULL;

					}

					zc->zc_objset = NULL;

					if (zc->zc_zap) {

						rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);

						zap_unlock(zc->zc_zap, zc);

					}

					memset(zc, 0, sizeof (zap_cursor_t));

				}

				int

				@@ -1126,30 +1173,15 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)

				{

					int err;

					if (zc->zc_zap == NULL)

						/* zap_cursor_init failed, cursor is invalid */

						return (SET_ERROR(EIO));

					if (zc->zc_hash == -1ULL)

						return (SET_ERROR(ENOENT));

					if (zc->zc_zap == NULL) {

						int hb;

						err = zap_lock(zc->zc_objset, zc->zc_zapobj, NULL,

						    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);

						if (err != 0)

							return (err);

					rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);

						/*

						 * To support zap_cursor_init_serialized, advance, retrieve,

						 * we must add to the existing zc_cd, which may already

						 * be 1 due to the zap_cursor_advance.

						 */

						ASSERT0(zc->zc_hash);

						hb = zap_hashbits(zc->zc_zap);

						zc->zc_hash = zc->zc_serialized << (64 - hb);

						zc->zc_cd += zc->zc_serialized >> hb;

						if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */

							zc->zc_cd = 0;

					} else {

						rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);

					}

					if (!zc->zc_zap->zap_ismicro) {

						err = fzap_cursor_retrieve(zc->zc_zap, zc, za);

					} else {

				@@ -1184,6 +1216,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)

							err = SET_ERROR(ENOENT);

						}

					}

					rw_exit(&zc->zc_zap->zap_rwlock);

					return (err);

				}

				@@ -1199,10 +1232,9 @@ zap_cursor_advance(zap_cursor_t *zc)

				uint64_t

				zap_cursor_serialize(zap_cursor_t *zc)

				{

					if (zc->zc_hash == -1ULL)

					if (zc->zc_zap == NULL || zc->zc_hash == -1ULL)

						return (-1ULL);

					if (zc->zc_zap == NULL)

						return (zc->zc_serialized);

					ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap)));

					ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));