From 8ff64005a2acc48fe18a5bfa3edb591376ce0bd4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 8 May 2026 21:31:13 +1000 Subject: [PATCH] zap: split implementation out into more files The ZAP code is mixed up across a few files without clear separation of concerns. This splits it out from three source files to five: - zap.c: the bulk of the "public" interface - zap_impl.c: internals shared across all backends - zap_micro.c: microzap backend - zap_fat.c: fatzap backend: core logic - zap_leaf.c: fatzap backend: leaf blocks Note that this does not change any code, just moves functions around. Also note that right now the microzap and fatzap backends know more about each other than is healthy. This change is simply marking out where different things should live in the end, to make it easier for that refactoring work to begin. Sponsored-by: TrueNAS Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Akash B Signed-off-by: Rob Norris Closes #18516 --- include/sys/zap_impl.h | 23 +- lib/libzpool/Makefile.am | 2 + module/Kbuild.in | 2 + module/Makefile.bsd | 4 +- module/zfs/zap.c | 2511 ++++++++++++++++---------------------- module/zfs/zap_fat.c | 1502 +++++++++++++++++++++++ module/zfs/zap_impl.c | 527 ++++++++ module/zfs/zap_micro.c | 1602 +----------------------- 8 files changed, 3135 insertions(+), 3038 deletions(-) create mode 100644 module/zfs/zap_fat.c create mode 100644 module/zfs/zap_impl.c diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index d010c3c305c..78c57e522bc 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -203,17 +203,38 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp); +int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp); +int zap_lockdir_impl(dnode_t *dn, 
dmu_buf_t *db, const void *tag, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap, const void *tag); void zap_evict_sync(void *dbu); +zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, + int numints); zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt); +int zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt); +zap_name_t * zap_name_alloc(zap_t *zap, boolean_t longname); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); uint32_t zap_maxcd(zap_t *zap); uint64_t zap_getflags(zap_t *zap); +int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags, + size_t outlen); +uint64_t zap_hash(zap_name_t *zn); uint64_t zap_get_micro_max_size(spa_t *spa); -#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) +zap_t *mzap_open(dmu_buf_t *db); +int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, + zap_flags_t flags); +mzap_ent_t *mze_find(zap_name_t *zn, zfs_btree_index_t *idx); +boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash); +void mze_destroy(zap_t *zap); +boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, + mzap_ent_t *mze, zfs_btree_index_t *idx); +void mzap_addent(zap_name_t *zn, uint64_t value); +void mzap_byteswap(mzap_phys_t *buf, size_t size); void fzap_byteswap(void *buf, size_t size); int fzap_count(zap_t *zap, uint64_t *count); diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 8192553072f..05105407d52 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -166,6 +166,8 @@ nodist_libzpool_la_SOURCES = \ module/zfs/vdev_root.c \ module/zfs/vdev_trim.c \ module/zfs/zap.c \ + module/zfs/zap_fat.c \ + module/zfs/zap_impl.c \ module/zfs/zap_leaf.c \ module/zfs/zap_micro.c \ module/zfs/zcp.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 47e739ea4d6..ff2c96b85ae 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ 
-408,6 +408,8 @@ ZFS_OBJS := \ vdev_root.o \ vdev_trim.o \ zap.o \ + zap_fat.o \ + zap_impl.o \ zap_leaf.o \ zap_micro.o \ zcp.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 30cf741b965..96c3f3b2418 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -345,6 +345,8 @@ SRCS+= abd.c \ vdev_root.c \ vdev_trim.c \ zap.c \ + zap_fat.c \ + zap_impl.c \ zap_leaf.c \ zap_micro.c \ zcp.c \ @@ -475,8 +477,8 @@ CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.zap_impl.c= -Wno-cast-qual CFLAGS.zap_leaf.c= -Wno-cast-qual -CFLAGS.zap_micro.c= -Wno-cast-qual CFLAGS.zcp.c= -Wno-cast-qual CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith diff --git a/module/zfs/zap.c b/module/zfs/zap.c index b40d765e342..fa3f8b836c9 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -19,1075 +19,116 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2023 Alexander Stetsenko - * Copyright (c) 2023, Klara Inc. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. */ -/* - * This file contains the top half of the zfs directory structure - * implementation. The bottom half is in zap_leaf.c. - * - * The zdir is an extendable hash data structure. There is a table of - * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are - * each a constant size and hold a variable number of directory entries. - * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. 
- * - * The pointer table holds a power of 2 number of pointers. - * (1<zd_data->zd_phys->zd_prefix_len). The bucket pointed to - * by the pointer at index i in the table holds entries whose hash value - * has a zd_prefix_len - bit prefix - */ - -#include +#include #include #include -#include -#include -#include +#include #include #include #include -/* - * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object - * (all leaf blocks) when we start iterating over it. - * - * For zap_cursor_init(), the callers all intend to iterate through all the - * entries. There are a few cases where an error (typically i/o error) could - * cause it to bail out early. - * - * For zap_cursor_init_serialized(), there are callers that do the iteration - * outside of ZFS. Typically they would iterate over everything, but we - * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), - * zcp_snapshots_iter(), and other iterators over things in the MOS - these - * are called by /sbin/zfs and channel programs. The other example is - * zfs_readdir() which iterates over directory entries for the getdents() - * syscall. /sbin/ls iterates to the end (unless it receives a signal), but - * userland doesn't have to. - * - * Given that the ZAP entries aren't returned in a specific order, the only - * legitimate use cases for partial iteration would be: - * - * 1. Pagination: e.g. you only want to display 100 entries at a time, so you - * get the first 100 and then wait for the user to hit "next page", which - * they may never do). - * - * 2. You want to know if there are more than X entries, without relying on - * the zfs-specific implementation of the directory's st_size (which is - * the number of entries). - */ -static int zap_iterate_prefetch = B_TRUE; - -/* - * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be - * collapsed into a single block. 
- */ -int zap_shrink_enabled = B_TRUE; - -int fzap_default_block_shift = 14; /* 16k blocksize */ - -static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); -static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); - -void -fzap_byteswap(void *vbuf, size_t size) -{ - uint64_t block_type = *(uint64_t *)vbuf; - - if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) - zap_leaf_byteswap(vbuf, size); - else { - /* it's a ptrtbl block */ - byteswap_uint64_array(vbuf, size); - } -} - -void -fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) -{ - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - zap->zap_ismicro = FALSE; - - zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; - zap->zap_dbu.dbu_evict_func_async = NULL; - - mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0); - zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; - - zap_phys_t *zp = zap_f_phys(zap); - /* - * explicitly zero it since it might be coming from an - * initialized microzap - */ - memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size); - zp->zap_block_type = ZBT_HEADER; - zp->zap_magic = ZAP_MAGIC; - - zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); - - zp->zap_freeblk = 2; /* block 1 will be the first leaf */ - zp->zap_num_leafs = 1; - zp->zap_num_entries = 0; - zp->zap_salt = zap->zap_salt; - zp->zap_normflags = zap->zap_normflags; - zp->zap_flags = flags; - - /* block 1 will be the first leaf */ - for (int i = 0; i < (1<zap_ptrtbl.zt_shift); i++) - ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; - - /* - * set up block 1 - the first leaf - */ - dmu_buf_t *db; - VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - 1<l_dbuf = db; - - zap_leaf_init(l, zp->zap_normflags != 0); - - kmem_free(l, sizeof (zap_leaf_t)); - dmu_buf_rele(db, FTAG); -} - -static int -zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) -{ - if (RW_WRITE_HELD(&zap->zap_rwlock)) - return (1); - if (rw_tryupgrade(&zap->zap_rwlock)) { - dmu_buf_will_dirty(zap->zap_dbuf, tx); - return 
(1); - } - return (0); -} - -/* - * Generic routines for dealing with the pointer & cookie tables. - */ - -static int -zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, - void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), - dmu_tx_t *tx) -{ - uint64_t newblk; - int bs = FZAP_BLOCK_SHIFT(zap); - int hepb = 1<<(bs-4); - /* hepb = half the number of entries in a block */ - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - ASSERT(tbl->zt_numblks > 0); - - if (tbl->zt_nextblk != 0) { - newblk = tbl->zt_nextblk; - } else { - newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); - tbl->zt_nextblk = newblk; - ASSERT0(tbl->zt_blks_copied); - dmu_prefetch_by_dnode(zap->zap_dnode, 0, - tbl->zt_blk << bs, tbl->zt_numblks << bs, - ZIO_PRIORITY_SYNC_READ); - } - - /* - * Copy the ptrtbl from the old to new location. - */ - - uint64_t b = tbl->zt_blks_copied; - dmu_buf_t *db_old; - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - - /* first half of entries in old[b] go to new[2*b+0] */ - dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db_new, tx); - transfer_func(db_old->db_data, db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db_new, tx); - transfer_func((uint64_t *)db_old->db_data + hepb, - db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - dmu_buf_rele(db_old, FTAG); - - tbl->zt_blks_copied++; - - dprintf("copied block %llu of %llu\n", - (u_longlong_t)tbl->zt_blks_copied, - (u_longlong_t)tbl->zt_numblks); - - if (tbl->zt_blks_copied == tbl->zt_numblks) { - (void) dmu_free_range(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, 
tbl->zt_numblks << bs, tx); - - tbl->zt_blk = newblk; - tbl->zt_numblks *= 2; - tbl->zt_shift++; - tbl->zt_nextblk = 0; - tbl->zt_blks_copied = 0; - - dprintf("finished; numblocks now %llu (%uk entries)\n", - (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10)); - } - - return (0); -} - -static int -zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, - dmu_tx_t *tx) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - - dprintf("storing %llx at index %llx\n", (u_longlong_t)val, - (u_longlong_t)idx); - - uint64_t blk = idx >> (bs-3); - uint64_t off = idx & ((1<<(bs-3))-1); - - dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - dmu_buf_will_dirty(db, tx); - - if (tbl->zt_nextblk != 0) { - uint64_t idx2 = idx * 2; - uint64_t blk2 = idx2 >> (bs-3); - uint64_t off2 = idx2 & ((1<<(bs-3))-1); - dmu_buf_t *db2; - - err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, - DMU_READ_NO_PREFETCH); - if (err != 0) { - dmu_buf_rele(db, FTAG); - return (err); - } - dmu_buf_will_dirty(db2, tx); - ((uint64_t *)db2->db_data)[off2] = val; - ((uint64_t *)db2->db_data)[off2+1] = val; - dmu_buf_rele(db2, FTAG); - } - - ((uint64_t *)db->db_data)[off] = val; - dmu_buf_rele(db, FTAG); - - return (0); -} - -static int -zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - uint64_t blk = idx >> (bs-3); - uint64_t off = idx & ((1<<(bs-3))-1); - - dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - *valp = ((uint64_t *)db->db_data)[off]; - dmu_buf_rele(db, FTAG); - - if (tbl->zt_nextblk != 0) { - /* - * read the nextblk for the sake of i/o error 
checking, - * so that zap_table_load() will catch errors for - * zap_table_store. - */ - blk = (idx*2) >> (bs-3); - - err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_nextblk + blk) << bs, FTAG, &db, - DMU_READ_NO_PREFETCH); - if (err == 0) - dmu_buf_rele(db, FTAG); - } - return (err); -} - -/* - * Routines for growing the ptrtbl. - */ - -static void -zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) -{ - for (int i = 0; i < n; i++) { - uint64_t lb = src[i]; - dst[2 * i + 0] = lb; - dst[2 * i + 1] = lb; - } -} - -static int -zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) -{ - /* - * The pointer table should never use more hash bits than we - * have (otherwise we'd be using useless zero bits to index it). - * If we are within 2 bits of running out, stop growing, since - * this is already an aberrant condition. - */ - if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) - return (SET_ERROR(ENOSPC)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - /* - * We are outgrowing the "embedded" ptrtbl (the one - * stored in the header block). Give it its own entire - * block, which will double the size of the ptrtbl. 
- */ - ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, - ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); - - uint64_t newblk = zap_allocate_blocks(zap, 1); - dmu_buf_t *db_new; - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, - DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - dmu_buf_will_dirty(db_new, tx); - zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - dmu_buf_rele(db_new, FTAG); - - zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; - zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; - zap_f_phys(zap)->zap_ptrtbl.zt_shift++; - - ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << - (FZAP_BLOCK_SHIFT(zap)-3)); - - return (0); - } else { - return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, - zap_ptrtbl_transfer, tx)); - } -} - -static void -zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) -{ - dmu_buf_will_dirty(zap->zap_dbuf, tx); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); - ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); - zap_f_phys(zap)->zap_num_entries += delta; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); -} - static uint64_t -zap_allocate_blocks(zap_t *zap, int nblocks) +zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - uint64_t newblk = zap_f_phys(zap)->zap_freeblk; - zap_f_phys(zap)->zap_freeblk += nblocks; - return (newblk); -} + uint64_t obj; -static void -zap_leaf_evict_sync(void *dbu) -{ - zap_leaf_t *l = dbu; + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - -static zap_leaf_t * 
-zap_create_leaf(zap_t *zap, dmu_tx_t *tx) -{ - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - uint64_t blkid = zap_allocate_blocks(zap, 1); - dmu_buf_t *db = NULL; - - VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, - DMU_READ_NO_PREFETCH)); - - /* - * Create the leaf structure and stash it on the dbuf. If zap was - * recent shrunk or truncated, the dbuf might have been sitting in the - * cache waiting to be evicted, and so still have the old leaf attached - * to it. If so, just reuse it. - */ - zap_leaf_t *l = dmu_buf_get_user(db); - if (l == NULL) { - l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - l->l_blkid = blkid; - l->l_dbuf = db; - rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, - &l->l_dbuf); - dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + if (allocated_dnode == NULL) { + dnode_t *dn; + obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + &dn, FTAG, tx); + mzap_create_impl(dn, normflags, flags, tx); + dnode_rele(dn, FTAG); } else { - ASSERT3U(l->l_blkid, ==, blkid); - ASSERT3P(l->l_dbuf, ==, db); + obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + allocated_dnode, tag, tx); + mzap_create_impl(*allocated_dnode, normflags, flags, tx); } - rw_enter(&l->l_rwlock, RW_WRITER); - dmu_buf_will_dirty(l->l_dbuf, tx); - - zap_leaf_init(l, zap->zap_normflags != 0); - - zap_f_phys(zap)->zap_num_leafs++; - - return (l); + return (obj); } -int -fzap_count(zap_t *zap, uint64_t *count) +uint64_t +zap_create(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - ASSERT(!zap->zap_ismicro); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ - *count = zap_f_phys(zap)->zap_num_entries; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); - return (0); + return (zap_create_norm(os, 0, ot, 
bonustype, bonuslen, tx)); +} + +uint64_t +zap_create_dnsize(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, + dnodesize, tx)); +} + +uint64_t +zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, + 0, tx)); +} + +uint64_t +zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_impl(os, normflags, 0, ot, 0, 0, + bonustype, bonuslen, dnodesize, NULL, NULL, tx)); +} + +uint64_t +zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_flags_dnsize(os, normflags, flags, ot, + leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); +} + +uint64_t +zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, + tx)); } /* - * Routines for obtaining zap_leaf_t's + * Create a zap object and return a pointer to the newly allocated dnode via + * the allocated_dnode argument. The returned dnode will be held and the + * caller is responsible for releasing the hold by calling dnode_rele(). 
*/ - -void -zap_put_leaf(zap_leaf_t *l) +uint64_t +zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { - rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf, NULL); + return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + allocated_dnode, tag, tx)); } -static zap_leaf_t * -zap_open_leaf(uint64_t blkid, dmu_buf_t *db) -{ - ASSERT(blkid != 0); - - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = blkid; - l->l_bs = highbit64(db->db_size) - 1; - l->l_dbuf = db; - - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); - - rw_exit(&l->l_rwlock); - if (winner != NULL) { - /* someone else set it first */ - zap_leaf_evict_sync(&l->l_dbu); - l = winner; - } - - /* - * lhr_pad was previously used for the next leaf in the leaf - * chain. There should be no chained leafs (as we have removed - * support for them). 
- */ - ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); - - /* - * There should be more hash entries than there can be - * chunks to put in the hash table - */ - ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); - - /* The chunks should begin at the end of the hash table */ - ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *) - &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); - - /* The chunks should end at the end of the block */ - ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - - (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); - - return (l); -} - -static int -zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, - zap_leaf_t **lp) -{ - dmu_buf_t *db; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - /* - * If system crashed just after dmu_free_long_range in zfs_rmnode, we - * would be left with an empty xattr dir in delete queue. blkid=0 - * would be passed in when doing zfs_purgedir. If that's the case we - * should just return immediately. The underlying objects should - * already be freed, so this should be perfectly fine. - */ - if (blkid == 0) - return (SET_ERROR(ENOENT)); - - int bs = FZAP_BLOCK_SHIFT(zap); - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - - ASSERT3U(db->db_object, ==, zap->zap_object); - ASSERT3U(db->db_offset, ==, blkid << bs); - ASSERT3U(db->db_size, ==, 1 << bs); - ASSERT(blkid != 0); - - zap_leaf_t *l = dmu_buf_get_user(db); - - if (l == NULL) - l = zap_open_leaf(blkid, db); - - rw_enter(&l->l_rwlock, lt); - /* - * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, - * causing ASSERT below to fail. 
- */ - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - ASSERT3U(l->l_blkid, ==, blkid); - ASSERT3P(l->l_dbuf, ==, db); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - *lp = l; - return (0); -} - -static int -zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) -{ - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - ASSERT3U(idx, <, - (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); - *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); - return (0); - } else { - return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, - idx, valp)); - } -} - -static int -zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) -{ - ASSERT(tx != NULL); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { - ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; - return (0); - } else { - return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, - idx, blk, tx)); - } -} - -static int -zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, - dmu_tx_t *tx) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - int epb = bs >> 3; /* entries per block */ - int err = 0; - - ASSERT(tx != NULL); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - /* - * Check for i/o errors - */ - for (int i = 0; i < nptrs; i += epb) { - uint64_t blk; - err = zap_idx_to_blk(zap, idx + i, &blk); - if (err != 0) { - return (err); - } - } - - for (int i = 0; i < nptrs; i++) { - err = zap_set_idx_to_blk(zap, idx + i, blk, tx); - ASSERT0(err); /* we checked for i/o errors above */ - if (err != 0) - break; - } - - return (err); -} - -#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) - -/* - * Each leaf has single range of entries (block pointers) in the ZAP ptrtbl. - * If two leaves are siblings, their ranges are adjecent and contain the same - * number of entries. 
In order to find out if a leaf has a sibling, we need to - * check the range corresponding to the sibling leaf. There is no need to check - * all entries in the range, we only need to check the frist and the last one. - */ -static uint64_t -check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) -{ - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); - uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; - uint64_t nptrs = (1 << pref_diff); - uint64_t first; - uint64_t last; - - ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); - - if (zap_idx_to_blk(zap, idx, &first) != 0) - return (0); - - if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) - return (0); - - if (first != last) - return (0); - return (first); -} - -static int -zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) -{ - uint64_t blk; - - ASSERT(zap->zap_dbuf == NULL || - zap_f_phys(zap) == zap->zap_dbuf->db_data); - - /* Reality check for corrupt zap objects (leaf or header). 
*/ - if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && - zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || - zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { - return (SET_ERROR(EIO)); - } - - uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - int err = zap_idx_to_blk(zap, idx, &blk); - if (err != 0) - return (err); - err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - - ASSERT(err || - ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == - zap_leaf_phys(*lp)->l_hdr.lh_prefix); - return (err); -} - -static int -zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, - const void *tag, dmu_tx_t *tx, zap_leaf_t **lp) -{ - zap_t *zap = zn->zn_zap; - uint64_t hash = zn->zn_hash; - int err; - int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - - ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - zap_leaf_phys(l)->l_hdr.lh_prefix); - - if (zap_tryupgradedir(zap, tx) == 0 || - old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { - /* We failed to upgrade, or need to grow the pointer table */ - objset_t *os = zap->zap_objset; - uint64_t object = zap->zap_object; - - zap_put_leaf(l); - *lp = l = NULL; - zap_unlockdir(zap, tag); - err = zap_lockdir(os, object, tx, RW_WRITER, - FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return (err); - ASSERT(!zap->zap_ismicro); - - while (old_prefix_len == - zap_f_phys(zap)->zap_ptrtbl.zt_shift) { - err = zap_grow_ptrtbl(zap, tx); - if (err != 0) - return (err); - } - - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - - if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { - /* it split while our locks were down */ - *lp = l; - return (0); - } - } - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - 
zap_leaf_phys(l)->l_hdr.lh_prefix); - - int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - - (old_prefix_len + 1); - uint64_t sibling = - (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; - - /* check for i/o errors before doing zap_leaf_split */ - for (int i = 0; i < (1ULL << prefix_diff); i++) { - uint64_t blk; - err = zap_idx_to_blk(zap, sibling + i, &blk); - if (err != 0) - return (err); - ASSERT3U(blk, ==, l->l_blkid); - } - - zap_leaf_t *nl = zap_create_leaf(zap, tx); - zap_leaf_split(l, nl, zap->zap_normflags != 0); - - /* set sibling pointers */ - for (int i = 0; i < (1ULL << prefix_diff); i++) { - err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); - ASSERT0(err); /* we checked for i/o errors above */ - } - - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0); - - if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { - /* we want the sibling */ - zap_put_leaf(l); - *lp = nl; - } else { - zap_put_leaf(nl); - *lp = l; - } - - return (0); -} - -static void -zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, - const void *tag, dmu_tx_t *tx) -{ - zap_t *zap = zn->zn_zap; - int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && - zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); - - zap_put_leaf(l); - - if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { - /* - * We are in the middle of growing the pointer table, or - * this leaf will soon make us grow it. 
- */ - if (zap_tryupgradedir(zap, tx) == 0) { - objset_t *os = zap->zap_objset; - uint64_t zapobj = zap->zap_object; - - zap_unlockdir(zap, tag); - int err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return; - } - - /* could have finished growing while our locks were down */ - if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) - (void) zap_grow_ptrtbl(zap, tx); - } -} - -static int -fzap_checkname(zap_name_t *zn) -{ - uint32_t maxnamelen = zn->zn_normbuf_len; - uint64_t len = (uint64_t)zn->zn_key_orig_numints * zn->zn_key_intlen; - /* Only allow directory zap to have longname */ - if (len > maxnamelen || - (len > ZAP_MAXNAMELEN && - zn->zn_zap->zap_dnode->dn_type != DMU_OT_DIRECTORY_CONTENTS)) - return (SET_ERROR(ENAMETOOLONG)); - return (0); -} - -static int -fzap_checksize(uint64_t integer_size, uint64_t num_integers) -{ - /* Only integer sizes supported by C */ - switch (integer_size) { - case 1: - case 2: - case 4: - case 8: - break; - default: - return (SET_ERROR(EINVAL)); - } - - if (integer_size * num_integers > ZAP_MAXVALUELEN) - return (SET_ERROR(E2BIG)); - - return (0); -} - -static int -fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) -{ - int err = fzap_checkname(zn); - if (err != 0) - return (err); - return (fzap_checksize(integer_size, num_integers)); -} - -/* - * Routines for manipulating attributes. 
- */ -int -fzap_lookup(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, void *buf, - char *realname, int rn_len, boolean_t *ncp, - uint64_t *actual_num_integers) -{ - zap_leaf_t *l; - zap_entry_handle_t zeh; - - int err = fzap_checkname(zn); - if (err != 0) - return (err); - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - if ((err = fzap_checksize(integer_size, num_integers)) != 0) { - zap_put_leaf(l); - return (err); - } - - err = zap_entry_read(&zeh, integer_size, num_integers, buf); - if (err == 0 && actual_num_integers != NULL) - *actual_num_integers = zeh.zeh_num_integers; - (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); - if (ncp) { - *ncp = zap_entry_normalization_conflict(&zeh, - zn, NULL, zn->zn_zap); - } - } - - zap_put_leaf(l); - return (err); -} - -int -fzap_add_cd(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - zap_t *zap = zn->zn_zap; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(!zap->zap_ismicro); - ASSERT0(fzap_check(zn, integer_size, num_integers)); - - err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - err = SET_ERROR(EEXIST); - goto out; - } - if (err != ENOENT) - goto out; - - err = zap_entry_create(l, zn, cd, - integer_size, num_integers, val, &zeh); - - if (err == 0) { - zap_increment_num_entries(zap, 1, tx); - } else if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) - goto retry; - } - -out: - if (l != NULL) { - if (err == ENOSPC) - zap_put_leaf(l); - else - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); - } - return (err); -} - -int -fzap_add(zap_name_t *zn, - 
uint64_t integer_size, uint64_t num_integers, - const void *val, const void *tag, dmu_tx_t *tx) -{ - int err = fzap_check(zn, integer_size, num_integers); - if (err != 0) - return (err); - - return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_NEED_CD, tag, tx)); -} - -int -fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, - const void *tag, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - boolean_t create; - zap_entry_handle_t zeh; - zap_t *zap = zn->zn_zap; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_check(zn, integer_size, num_integers); - if (err != 0) - return (err); - - err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, zn, &zeh); - create = (err == ENOENT); - ASSERT(err == 0 || err == ENOENT); - - if (create) { - err = zap_entry_create(l, zn, ZAP_NEED_CD, - integer_size, num_integers, val, &zeh); - if (err == 0) - zap_increment_num_entries(zap, 1, tx); - } else { - err = zap_entry_update(&zeh, integer_size, num_integers, val); - } - - if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) - goto retry; - } - - if (l != NULL) { - if (err == ENOSPC) - zap_put_leaf(l); - else - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); - } - return (err); -} - -int -fzap_length(zap_name_t *zn, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err != 0) - goto out; - - if (integer_size != NULL) - *integer_size = zeh.zeh_integer_size; - if (num_integers != NULL) - *num_integers = zeh.zeh_num_integers; -out: - zap_put_leaf(l); - return (err); -} - -int -fzap_remove(zap_name_t *zn, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - - err 
= zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - zap_entry_remove(&zeh); - zap_increment_num_entries(zn->zn_zap, -1, tx); - - if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && - zap_shrink_enabled) - return (zap_shrink(zn, l, tx)); - } - zap_put_leaf(l); - return (err); -} - -void -fzap_prefetch(zap_name_t *zn) -{ - uint64_t blk; - zap_t *zap = zn->zn_zap; - - uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, - zap_f_phys(zap)->zap_ptrtbl.zt_shift); - if (zap_idx_to_blk(zap, idx, &blk) != 0) - return; - int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, - ZIO_PRIORITY_SYNC_READ); -} - -/* - * Helper functions for consumers. - */ - uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) @@ -1109,6 +150,777 @@ zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, return (new_obj); } +int +zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, + 0, tx)); +} + +int +zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, + 0, ot, bonustype, bonuslen, dnodesize, tx)); +} + +int +zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, + bonuslen, 0, tx)); +} + +int +zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, 
DMU_BSWAP_ZAP); + error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, + dnodesize, tx); + if (error != 0) + return (error); + + error = dnode_hold(os, obj, FTAG, &dn); + if (error != 0) + return (error); + + mzap_create_impl(dn, normflags, 0, tx); + + dnode_rele(dn, FTAG); + + return (0); +} + +int +zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) +{ + /* + * dmu_object_free will free the object number and free the + * data. Freeing the data will cause our pageout function to be + * called, which will destroy our data (zap_leaf_t's and zap_t). + */ + + return (dmu_object_free(os, zapobj, tx)); +} + +/* + * Routines for manipulating attributes. + */ + +static int +zap_lookup_impl(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + int err = 0; + + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); + if (zn == NULL) + return (SET_ERROR(ENOTSUP)); + + if (!zap->zap_ismicro) { + err = fzap_lookup(zn, integer_size, num_integers, buf, + realname, rn_len, ncp, NULL); + } else { + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + if (num_integers < 1) { + err = SET_ERROR(EOVERFLOW); + } else if (integer_size != 8) { + err = SET_ERROR(EINVAL); + } else { + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; + if (realname != NULL) + (void) strlcpy(realname, + MZE_PHYS(zap, mze)->mze_name, + rn_len); + if (ncp) { + *ncp = mzap_normalization_conflict(zap, + zn, mze, &idx); + } + } + } + } + zap_name_free(zn); + return (err); +} + +int +zap_lookup(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm(os, zapobj, name, integer_size, + num_integers, buf, 0, NULL, 0, NULL)); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, 
void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} + +static int +zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf, + uint64_t *actual_num_integers) +{ + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + int err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL, actual_num_integers); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + + +int +zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_length_uint64_impl(zap, key, key_numints, + integer_size, num_integers, buf, NULL); + /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_length_uint64_impl(zap, key, key_numints, + integer_size, num_integers, buf, NULL); + /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf, + uint64_t *actual_num_integers) +{ + zap_t *zap; + + 
int err = + zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_length_uint64_impl(zap, key, key_numints, + integer_size, num_integers, buf, actual_num_integers); + /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_contains(objset_t *os, uint64_t zapobj, const char *name) +{ + int err = zap_lookup_norm(os, zapobj, name, 0, + 0, NULL, 0, NULL, 0, NULL); + if (err == EOVERFLOW || err == EINVAL) + err = 0; /* found, but skipped reading the value */ + return (err); +} + +int +zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + zn = zap_name_alloc_str(zap, name, 0); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_prefetch_object(objset_t *os, uint64_t zapobj) +{ + int error; + dmu_object_info_t doi; + + error = dmu_object_info(os, zapobj, &doi); + if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) + error = SET_ERROR(EINVAL); + if (error == 0) + dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset); + + return (error); +} + + +static int +zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints) +{ + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (0); +} + +int +zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_prefetch_uint64_impl(zap, key, key_numints); + /* 
zap_prefetch_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_prefetch_uint64_impl(zap, key, key_numints); + /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_lookup_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm_by_dnode(dn, name, integer_size, + num_integers, buf, 0, NULL, 0, NULL)); +} + +int +zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + + int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} + +static int +zap_add_impl(zap_t *zap, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + const uint64_t *intval = val; + int err = 0; + + zap_name_t *zn = zap_name_alloc_str(zap, key, 0); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + } else if (integer_size != 8 || num_integers != 1 || + strlen(key) >= MZAP_NAME_LEN || + !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { + err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); + if (err == 0) { + err = fzap_add(zn, integer_size, num_integers, val, + tag, tx); + } + zap = zn->zn_zap; /* fzap_add() may change zap */ + } else { + zfs_btree_index_t idx; + if (mze_find(zn, &idx) != NULL) { + err = 
SET_ERROR(EEXIST); + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_add(objset_t *os, uint64_t zapobj, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); + /* zap_add_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); + /* zap_add_impl() calls zap_unlockdir() */ + return (err); +} + +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, 
val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + const uint64_t *intval = val; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_update(zn, integer_size, num_integers, val, + FTAG, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + (u_longlong_t)zapobj, integer_size, + (u_longlong_t)num_integers, name); + err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); + if (err == 0) { + err = fzap_update(zn, integer_size, num_integers, + val, FTAG, tx); + } + zap = zn->zn_zap; /* fzap_update() may change zap */ + } else { + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); + if (mze != NULL) { + MZE_PHYS(zap, mze)->mze_value = *intval; + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap, FTAG); + return (err); +} + +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int 
integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_length(objset_t *os, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_length(zn, integer_size, num_integers); + } else { + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); + if (mze == NULL) 
{ + err = SET_ERROR(ENOENT); + } else { + if (integer_size) + *integer_size = 8; + if (num_integers) + *num_integers = 1; + } + } + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + + int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +static int +zap_remove_impl(zap_t *zap, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ + int err = 0; + + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); + if (zn == NULL) + return (SET_ERROR(ENOTSUP)); + if (!zap->zap_ismicro) { + err = fzap_remove(zn, tx); + } else { + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + zap->zap_m.zap_num_entries--; + memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); + zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); + } + } + zap_name_free(zn); + return (err); +} + +int +zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) +{ + return (zap_remove_norm(os, 
zapobj, name, 0, tx)); +} + +int +zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + err = zap_remove_impl(zap, name, mt, tx); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + err = zap_remove_impl(zap, name, 0, tx); + zap_unlockdir(zap, FTAG); + return (err); +} + +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + if 
(!zap->zap_ismicro) { + err = fzap_count(zap, count); + } else { + *count = zap->zap_m.zap_num_entries; + } + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_count_by_dnode(dnode_t *dn, uint64_t *count) +{ + zap_t *zap; + + int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) + return (err); + if (!zap->zap_ismicro) { + err = fzap_count(zap, count); + } else { + *count = zap->zap_m.zap_num_entries; + } + zap_unlockdir(zap, FTAG); + return (err); +} + +/* + * Helper functions for consumers. + */ + int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name, uint64_t namelen) @@ -1213,7 +1025,6 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, zap_attribute_free(za); return (err); } - int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { @@ -1241,6 +1052,16 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) return (zap_lookup(os, obj, name, 8, 1, &value)); } +int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_increment(os, obj, name, delta, tx)); +} + int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) @@ -1290,428 +1111,238 @@ zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, return (err); } -int -zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_increment(os, obj, name, delta, tx)); -} - /* * Routines for iterating over the attributes. 
*/ -int -fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) +static void +zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized, boolean_t prefetch) { - int err; - zap_entry_handle_t zeh; - zap_leaf_t *l; - - /* retrieve the next entry at or after zc_hash/zc_cd */ - /* if no entry, return ENOENT */ - - /* - * If we are reading from the beginning, we're almost certain to - * iterate over the entire ZAP object. If there are multiple leaf - * blocks (freeblk > 2), prefetch the whole object (up to - * dmu_prefetch_max bytes), so that we read the leaf blocks - * concurrently. (Unless noprefetch was requested via - * zap_cursor_init_noprefetch()). - */ - if (zc->zc_hash == 0 && zap_iterate_prefetch && - zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, - zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), - ZIO_PRIORITY_ASYNC_READ); - } - - if (zc->zc_leaf) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - - /* - * The leaf was either shrunk or split. 
- */ - if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || - (ZAP_HASH_IDX(zc->zc_hash, - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - } - -again: - if (zc->zc_leaf == NULL) { - err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, - &zc->zc_leaf); - if (err != 0) - return (err); - } - l = zc->zc_leaf; - - err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); - - if (err == ENOENT) { - if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) { - zc->zc_hash = -1ULL; - zc->zc_cd = 0; - } else { - uint64_t nocare = (1ULL << - (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; - - zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; - zc->zc_cd = 0; - - if (zc->zc_hash == 0) { - zc->zc_hash = -1ULL; - } else { - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - goto again; - } - } - } - - if (err == 0) { - zc->zc_hash = zeh.zeh_hash; - zc->zc_cd = zeh.zeh_cd; - za->za_integer_length = zeh.zeh_integer_size; - za->za_num_integers = zeh.zeh_num_integers; - if (zeh.zeh_num_integers == 0) { - za->za_first_integer = 0; - } else { - err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); - ASSERT(err == 0 || err == EOVERFLOW); - } - err = zap_entry_read_name(zap, &zeh, - za->za_name_len, za->za_name); - ASSERT0(err); - - za->za_normalization_conflict = - zap_entry_normalization_conflict(&zeh, - NULL, za->za_name, zap); - } - rw_exit(&zc->zc_leaf->l_rwlock); - return (err); + zc->zc_objset = os; + zc->zc_zap = NULL; + zc->zc_leaf = NULL; + zc->zc_zapobj = zapobj; + zc->zc_serialized = serialized; + zc->zc_hash = 0; + zc->zc_cd = 0; + zc->zc_prefetch = prefetch; } -static void -zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +/* + * Initialize a cursor at the beginning of the ZAP object. The entire + * ZAP object will be prefetched. 
+ */ +void +zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { - uint64_t lastblk = 0; + zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); +} - /* - * NB: if a leaf has more pointers than an entire ptrtbl block - * can hold, then it'll be accounted for more than once, since - * we won't have lastblk. - */ - for (int i = 0; i < len; i++) { - zap_leaf_t *l; - - if (tbl[i] == lastblk) - continue; - lastblk = tbl[i]; - - int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); - if (err == 0) { - zap_leaf_stats(zap, l, zs); - zap_put_leaf(l); - } - } +/* + * Initialize a cursor at the beginning, but request that we not prefetch + * the entire ZAP object. + */ +void +zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); } void -fzap_get_stats(zap_t *zap, zap_stats_t *zs) +zap_cursor_fini(zap_cursor_t *zc) { - int bs = FZAP_BLOCK_SHIFT(zap); - zs->zs_blocksize = 1ULL << bs; - - /* - * Set zap_phys_t fields - */ - zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; - zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; - zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; - zs->zs_block_type = zap_f_phys(zap)->zap_block_type; - zs->zs_magic = zap_f_phys(zap)->zap_magic; - zs->zs_salt = zap_f_phys(zap)->zap_salt; - - /* - * Set zap_ptrtbl fields - */ - zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; - zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; - zs->zs_ptrtbl_blks_copied = - zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; - zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; - zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; - zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - /* the ptrtbl is entirely in the header block. 
*/ - zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); - } else { - dmu_prefetch_by_dnode(zap->zap_dnode, 0, - zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, - ZIO_PRIORITY_SYNC_READ); - - for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; - b++) { - dmu_buf_t *db; - int err; - - err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, - FTAG, &db, DMU_READ_NO_PREFETCH); - if (err == 0) { - zap_stats_ptrtbl(zap, db->db_data, - 1<<(bs-3), zs); - dmu_buf_rele(db, FTAG); - } - } + if (zc->zc_zap) { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + zap_unlockdir(zc->zc_zap, NULL); + zc->zc_zap = NULL; } + if (zc->zc_leaf) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + zc->zc_objset = NULL; } -/* - * Find last allocated block and update freeblk. - */ -static void -zap_trunc(zap_t *zap) +int +zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { - uint64_t nentries; - uint64_t lastblk; + int err; - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + if (zc->zc_hash == -1ULL) + return (SET_ERROR(ENOENT)); - if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { - /* External ptrtbl */ - nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); - lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + - zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; + if (zc->zc_zap == NULL) { + int hb; + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); + if (err != 0) + return (err); + + /* + * To support zap_cursor_init_serialized, advance, retrieve, + * we must add to the existing zc_cd, which may already + * be 1 due to the zap_cursor_advance. 
+ */ + ASSERT0(zc->zc_hash); + hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = zc->zc_serialized << (64 - hb); + zc->zc_cd += zc->zc_serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; } else { - /* Embedded ptrtbl */ - nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - lastblk = 0; + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_retrieve(zc->zc_zap, zc, za); + } else { + zfs_btree_index_t idx; + mzap_ent_t mze_tofind; - for (uint64_t idx = 0; idx < nentries; idx++) { - uint64_t blk; - if (zap_idx_to_blk(zap, idx, &blk) != 0) - return; - if (blk > lastblk) - lastblk = blk; + mze_tofind.mze_hash = zc->zc_hash >> 32; + mze_tofind.mze_cd = zc->zc_cd; + + mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, + &mze_tofind, &idx); + if (mze == NULL) { + mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, + &idx, &idx); + } + if (mze) { + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); + za->za_normalization_conflict = + mzap_normalization_conflict(zc->zc_zap, NULL, + mze, &idx); + za->za_integer_length = 8; + za->za_num_integers = 1; + za->za_first_integer = mzep->mze_value; + (void) strlcpy(za->za_name, mzep->mze_name, + za->za_name_len); + zc->zc_hash = (uint64_t)mze->mze_hash << 32; + zc->zc_cd = mze->mze_cd; + err = 0; + } else { + zc->zc_hash = -1ULL; + err = SET_ERROR(ENOENT); + } } - - ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); - - zap_f_phys(zap)->zap_freeblk = lastblk + 1; -} - -/* - * ZAP shrinking algorithm. - * - * We shrink ZAP recuresively removing empty leaves. We can remove an empty leaf - * only if it has a sibling. Sibling leaves have the same prefix length and - * their prefixes differ only by the least significant (sibling) bit. We require - * both siblings to be empty. This eliminates a need to rehash the non-empty - * remaining leaf. 
When we have removed one of two empty sibling, we set ptrtbl - * entries of the removed leaf to point out to the remaining leaf. Prefix length - * of the remaining leaf is decremented. As a result, it has a new prefix and it - * might have a new sibling. So, we repeat the process. - * - * Steps: - * 1. Check if a sibling leaf (sl) exists and it is empty. - * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. - * 3. Release the sibling (sl) to derefer it again with WRITER lock. - * 4. Upgrade zapdir lock to WRITER (once). - * 5. Derefer released leaves again. - * 6. If it is needed, recheck whether both leaves are still siblings and empty. - * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of - * the remaining leaf (slbit 0). - * 8. Free disk block of the removed leaf (dmu_free_range). - * 9. Decrement prefix_len of the remaining leaf. - * 10. Repeat the steps. - */ -static int -zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) -{ - zap_t *zap = zn->zn_zap; - int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - uint64_t hash = zn->zn_hash; - uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; - uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - boolean_t trunc = B_FALSE; - int err = 0; - - ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); - ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); - - boolean_t writer = B_FALSE; - - /* - * To avoid deadlock always deref leaves in the same order - - * sibling 0 first, then sibling 1. - */ - while (prefix_len) { - zap_leaf_t *sl; - int64_t prefix_diff = zt_shift - prefix_len; - uint64_t sl_prefix = prefix ^ 1; - uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); - int slbit = prefix & 1; - - ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); - - /* - * Check if there is a sibling by reading ptrtbl ptrs. 
- */ - if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) - break; - - /* - * sibling 1, unlock it - we haven't yet dereferenced sibling 0. - */ - if (slbit == 1) { - zap_put_leaf(l); - l = NULL; - } - - /* - * Dereference sibling leaf and check if it is empty. - */ - if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, - &sl)) != 0) - break; - - ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); - - /* - * Check if we have a sibling and it is empty. - */ - if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || - zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { - zap_put_leaf(sl); - break; - } - - zap_put_leaf(sl); - - /* - * If there two empty sibling, we have work to do, so - * we need to lock ZAP ptrtbl as WRITER. - */ - if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { - /* We failed to upgrade */ - if (l != NULL) { - zap_put_leaf(l); - l = NULL; - } - - /* - * Usually, the right way to upgrade from a READER lock - * to a WRITER lock is to call zap_unlockdir() and - * zap_lockdir(), but we do not have a tag. Instead, - * we do it in more sophisticated way. - */ - rw_exit(&zap->zap_rwlock); - rw_enter(&zap->zap_rwlock, RW_WRITER); - dmu_buf_will_dirty(zap->zap_dbuf, tx); - - zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - writer = B_TRUE; - } - - /* - * Here we have WRITER lock for ptrtbl. - * Now, we need a WRITER lock for both siblings leaves. - * Also, we have to recheck if the leaves are still siblings - * and still empty. - */ - if (l == NULL) { - /* sibling 0 */ - if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash), - tx, RW_WRITER, &l)) != 0) - break; - - /* - * The leaf isn't empty anymore or - * it was shrunk/split while our locks were down. - */ - if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || - zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) - break; - } - - /* sibling 1 */ - if ((err = zap_deref_leaf(zap, (slbit ? 
hash : sl_hash), tx, - RW_WRITER, &sl)) != 0) - break; - - /* - * The leaf isn't empty anymore or - * it was shrunk/split while our locks were down. - */ - if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || - zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { - zap_put_leaf(sl); - break; - } - - /* If we have gotten here, we have a leaf to collapse */ - uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff; - uint64_t nptrs = (1ULL << prefix_diff); - uint64_t sl_blkid = sl->l_blkid; - - /* - * Set ptrtbl entries to point out to the slibling 0 blkid - */ - if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, - tx)) != 0) { - zap_put_leaf(sl); - break; - } - - /* - * Free sibling 1 disk block. - */ - int bs = FZAP_BLOCK_SHIFT(zap); - if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) - trunc = B_TRUE; - - (void) dmu_free_range(zap->zap_objset, zap->zap_object, - sl_blkid << bs, 1 << bs, tx); - zap_put_leaf(sl); - - zap_f_phys(zap)->zap_num_leafs--; - - /* - * Update prefix and prefix_len. 
- */ - zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; - zap_leaf_phys(l)->l_hdr.lh_prefix_len--; - - prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; - prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - } - - if (trunc) - zap_trunc(zap); - - if (l != NULL) - zap_put_leaf(l); - + rw_exit(&zc->zc_zap->zap_rwlock); return (err); } -ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, - "When iterating ZAP object, prefetch it"); +void +zap_cursor_advance(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return; + zc->zc_cd++; +} -ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, - "Enable ZAP shrinking"); +uint64_t +zap_cursor_serialize(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return (-1ULL); + if (zc->zc_zap == NULL) + return (zc->zc_serialized); + ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap))); + ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); + + /* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So usually use a small + * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits + * of the cursor. 
+ * + * [ collision differentiator | zap_hashbits()-bit hash value ] + */ + return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | + ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); +} + +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); +} + +int +zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + + memset(zs, 0, sizeof (zap_stats_t)); + + if (zap->zap_ismicro) { + zs->zs_blocksize = zap->zap_dbuf->db_size; + zs->zs_num_entries = zap->zap_m.zap_num_entries; + zs->zs_num_blocks = 1; + } else { + fzap_get_stats(zap, zs); + } + zap_unlockdir(zap, FTAG); + return (0); +} + +EXPORT_SYMBOL(zap_create); +EXPORT_SYMBOL(zap_create_dnsize); +EXPORT_SYMBOL(zap_create_norm); +EXPORT_SYMBOL(zap_create_norm_dnsize); +EXPORT_SYMBOL(zap_create_flags); +EXPORT_SYMBOL(zap_create_flags_dnsize); +EXPORT_SYMBOL(zap_create_claim); +EXPORT_SYMBOL(zap_create_claim_norm); +EXPORT_SYMBOL(zap_create_claim_norm_dnsize); +EXPORT_SYMBOL(zap_create_hold); +EXPORT_SYMBOL(zap_destroy); +EXPORT_SYMBOL(zap_lookup); +EXPORT_SYMBOL(zap_lookup_by_dnode); +EXPORT_SYMBOL(zap_lookup_norm); +EXPORT_SYMBOL(zap_lookup_uint64); +EXPORT_SYMBOL(zap_lookup_length_uint64_by_dnode); +EXPORT_SYMBOL(zap_contains); +EXPORT_SYMBOL(zap_prefetch); +EXPORT_SYMBOL(zap_prefetch_uint64); +EXPORT_SYMBOL(zap_prefetch_object); +EXPORT_SYMBOL(zap_add); +EXPORT_SYMBOL(zap_add_by_dnode); +EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); +EXPORT_SYMBOL(zap_update); +EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); +EXPORT_SYMBOL(zap_length); +EXPORT_SYMBOL(zap_length_uint64); +EXPORT_SYMBOL(zap_length_uint64_by_dnode); +EXPORT_SYMBOL(zap_remove); +EXPORT_SYMBOL(zap_remove_by_dnode); 
+EXPORT_SYMBOL(zap_remove_norm); +EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); +EXPORT_SYMBOL(zap_count); +EXPORT_SYMBOL(zap_count_by_dnode); +EXPORT_SYMBOL(zap_value_search); +EXPORT_SYMBOL(zap_join); +EXPORT_SYMBOL(zap_join_increment); +EXPORT_SYMBOL(zap_add_int); +EXPORT_SYMBOL(zap_remove_int); +EXPORT_SYMBOL(zap_lookup_int); +EXPORT_SYMBOL(zap_increment_int); +EXPORT_SYMBOL(zap_add_int_key); +EXPORT_SYMBOL(zap_lookup_int_key); +EXPORT_SYMBOL(zap_increment); +EXPORT_SYMBOL(zap_cursor_init); +EXPORT_SYMBOL(zap_cursor_fini); +EXPORT_SYMBOL(zap_cursor_retrieve); +EXPORT_SYMBOL(zap_cursor_advance); +EXPORT_SYMBOL(zap_cursor_serialize); +EXPORT_SYMBOL(zap_cursor_init_serialized); +EXPORT_SYMBOL(zap_get_stats); diff --git a/module/zfs/zap_fat.c b/module/zfs/zap_fat.c new file mode 100644 index 00000000000..6e2f076cfc3 --- /dev/null +++ b/module/zfs/zap_fat.c @@ -0,0 +1,1502 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 Alexander Stetsenko + * Copyright (c) 2023, Klara Inc. + */ + +/* + * This file contains the top half of the zfs directory structure + * implementation. The bottom half is in zap_leaf.c. + * + * The zdir is an extendable hash data structure. There is a table of + * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are + * each a constant size and hold a variable number of directory entries. + * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. + * + * The pointer table holds a power of 2 number of pointers. + * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to + * by the pointer at index i in the table holds entries whose hash value + * has a zd_prefix_len - bit prefix + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object + * (all leaf blocks) when we start iterating over it. + * + * For zap_cursor_init(), the callers all intend to iterate through all the + * entries. There are a few cases where an error (typically i/o error) could + * cause it to bail out early. + * + * For zap_cursor_init_serialized(), there are callers that do the iteration + * outside of ZFS. Typically they would iterate over everything, but we + * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), + * zcp_snapshots_iter(), and other iterators over things in the MOS - these + * are called by /sbin/zfs and channel programs. The other example is + * zfs_readdir() which iterates over directory entries for the getdents() + * syscall. /sbin/ls iterates to the end (unless it receives a signal), but + * userland doesn't have to. + * + * Given that the ZAP entries aren't returned in a specific order, the only + * legitimate use cases for partial iteration would be: + * + * 1. Pagination: e.g.
you only want to display 100 entries at a time, so you + * get the first 100 and then wait for the user to hit "next page", which + * they may never do). + * + * 2. You want to know if there are more than X entries, without relying on + * the zfs-specific implementation of the directory's st_size (which is + * the number of entries). + */ +static int zap_iterate_prefetch = B_TRUE; + +/* + * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be + * collapsed into a single block. + */ +int zap_shrink_enabled = B_TRUE; + +int fzap_default_block_shift = 14; /* 16k blocksize */ + +static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); +static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); + +void +fzap_byteswap(void *vbuf, size_t size) +{ + uint64_t block_type = *(uint64_t *)vbuf; + + if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) + zap_leaf_byteswap(vbuf, size); + else { + /* it's a ptrtbl block */ + byteswap_uint64_array(vbuf, size); + } +} + +void +fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + zap->zap_ismicro = FALSE; + + zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; + zap->zap_dbu.dbu_evict_func_async = NULL; + + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0); + zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; + + zap_phys_t *zp = zap_f_phys(zap); + /* + * explicitly zero it since it might be coming from an + * initialized microzap + */ + memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size); + zp->zap_block_type = ZBT_HEADER; + zp->zap_magic = ZAP_MAGIC; + + zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); + + zp->zap_freeblk = 2; /* block 1 will be the first leaf */ + zp->zap_num_leafs = 1; + zp->zap_num_entries = 0; + zp->zap_salt = zap->zap_salt; + zp->zap_normflags = zap->zap_normflags; + zp->zap_flags = flags; + + /* block 1 will be the first leaf */ + for (int i = 0; i < 
(1<<zp->zap_ptrtbl.zt_shift); i++) + ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; + + /* + * set up block 1 - the first leaf + */ + dmu_buf_t *db; + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db, tx); + + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_dbuf = db; + + zap_leaf_init(l, zp->zap_normflags != 0); + + kmem_free(l, sizeof (zap_leaf_t)); /* l was only a scratch handle */ + dmu_buf_rele(db, FTAG); +} + +static int +zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) +{ + if (RW_WRITE_HELD(&zap->zap_rwlock)) + return (1); + if (rw_tryupgrade(&zap->zap_rwlock)) { + dmu_buf_will_dirty(zap->zap_dbuf, tx); + return (1); + } + return (0); +} + +/* + * Generic routines for dealing with the pointer & cookie tables. + */ + +static int +zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, + void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), + dmu_tx_t *tx) +{ + uint64_t newblk; + int bs = FZAP_BLOCK_SHIFT(zap); + int hepb = 1<<(bs-4); + /* hepb = half the number of entries in a block */ + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + ASSERT(tbl->zt_numblks > 0); + + if (tbl->zt_nextblk != 0) { + newblk = tbl->zt_nextblk; + } else { + newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); + tbl->zt_nextblk = newblk; + ASSERT0(tbl->zt_blks_copied); + dmu_prefetch_by_dnode(zap->zap_dnode, 0, + tbl->zt_blk << bs, tbl->zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); + } + + /* + * Copy the ptrtbl from the old to new location.
+ */ + + uint64_t b = tbl->zt_blks_copied; + dmu_buf_t *db_old; + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + + /* first half of entries in old[b] go to new[2*b+0] */ + dmu_buf_t *db_new; + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db_new, tx); + transfer_func(db_old->db_data, db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + /* second half of entries in old[b] go to new[2*b+1] */ + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db_new, tx); + transfer_func((uint64_t *)db_old->db_data + hepb, + db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + dmu_buf_rele(db_old, FTAG); + + tbl->zt_blks_copied++; + + dprintf("copied block %llu of %llu\n", + (u_longlong_t)tbl->zt_blks_copied, + (u_longlong_t)tbl->zt_numblks); + + if (tbl->zt_blks_copied == tbl->zt_numblks) { + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); + + tbl->zt_blk = newblk; + tbl->zt_numblks *= 2; + tbl->zt_shift++; + tbl->zt_nextblk = 0; + tbl->zt_blks_copied = 0; + + dprintf("finished; numblocks now %llu (%uk entries)\n", + (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10)); + } + + return (0); +} + +static int +zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + + dprintf("storing %llx at index %llx\n", (u_longlong_t)val, + (u_longlong_t)idx); + + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); + + dmu_buf_t *db; + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + dmu_buf_will_dirty(db, 
tx); + + if (tbl->zt_nextblk != 0) { + uint64_t idx2 = idx * 2; + uint64_t blk2 = idx2 >> (bs-3); + uint64_t off2 = idx2 & ((1<<(bs-3))-1); + dmu_buf_t *db2; + + err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, + DMU_READ_NO_PREFETCH); + if (err != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + dmu_buf_will_dirty(db2, tx); + ((uint64_t *)db2->db_data)[off2] = val; + ((uint64_t *)db2->db_data)[off2+1] = val; + dmu_buf_rele(db2, FTAG); + } + + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db, FTAG); + + return (0); +} + +static int +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); + + dmu_buf_t *db; + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + *valp = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db, FTAG); + + if (tbl->zt_nextblk != 0) { + /* + * read the nextblk for the sake of i/o error checking, + * so that zap_table_load() will catch errors for + * zap_table_store. + */ + blk = (idx*2) >> (bs-3); + + err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_nextblk + blk) << bs, FTAG, &db, + DMU_READ_NO_PREFETCH); + if (err == 0) + dmu_buf_rele(db, FTAG); + } + return (err); +} + +/* + * Routines for growing the ptrtbl. + */ + +static void +zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) +{ + for (int i = 0; i < n; i++) { + uint64_t lb = src[i]; + dst[2 * i + 0] = lb; + dst[2 * i + 1] = lb; + } +} + +static int +zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) +{ + /* + * The pointer table should never use more hash bits than we + * have (otherwise we'd be using useless zero bits to index it). + * If we are within 2 bits of running out, stop growing, since + * this is already an aberrant condition. 
+ */ + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) + return (SET_ERROR(ENOSPC)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + /* + * We are outgrowing the "embedded" ptrtbl (the one + * stored in the header block). Give it its own entire + * block, which will double the size of the ptrtbl. + */ + ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); + + uint64_t newblk = zap_allocate_blocks(zap, 1); + dmu_buf_t *db_new; + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, + DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + dmu_buf_will_dirty(db_new, tx); + zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + dmu_buf_rele(db_new, FTAG); + + zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; + zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; + zap_f_phys(zap)->zap_ptrtbl.zt_shift++; + + ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << + (FZAP_BLOCK_SHIFT(zap)-3)); + + return (0); + } else { + return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, + zap_ptrtbl_transfer, tx)); + } +} + +static void +zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); + ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); + zap_f_phys(zap)->zap_num_entries += delta; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); +} + +static uint64_t +zap_allocate_blocks(zap_t *zap, int nblocks) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + uint64_t newblk = zap_f_phys(zap)->zap_freeblk; + zap_f_phys(zap)->zap_freeblk += nblocks; + return (newblk); +} + +static void +zap_leaf_evict_sync(void *dbu) +{ + zap_leaf_t *l = dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + +static zap_leaf_t * 
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + uint64_t blkid = zap_allocate_blocks(zap, 1); + dmu_buf_t *db = NULL; + + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, + DMU_READ_NO_PREFETCH)); + + /* + * Create the leaf structure and stash it on the dbuf. If zap was + * recently shrunk or truncated, the dbuf might have been sitting in the + * cache waiting to be evicted, and so still have the old leaf attached + * to it. If so, just reuse it. + */ + zap_leaf_t *l = dmu_buf_get_user(db); + if (l == NULL) { + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_blkid = blkid; + l->l_dbuf = db; + rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, + &l->l_dbuf); + dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + } else { + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + } + + rw_enter(&l->l_rwlock, RW_WRITER); + dmu_buf_will_dirty(l->l_dbuf, tx); + + zap_leaf_init(l, zap->zap_normflags != 0); + + zap_f_phys(zap)->zap_num_leafs++; + + return (l); +} + +int +fzap_count(zap_t *zap, uint64_t *count) +{ + ASSERT(!zap->zap_ismicro); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ + *count = zap_f_phys(zap)->zap_num_entries; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); + return (0); +} + +/* + * Routines for obtaining zap_leaf_t's + */ + +void +zap_put_leaf(zap_leaf_t *l) +{ + rw_exit(&l->l_rwlock); + dmu_buf_rele(l->l_dbuf, NULL); +} + +static zap_leaf_t * +zap_open_leaf(uint64_t blkid, dmu_buf_t *db) +{ + ASSERT(blkid != 0); + + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = blkid; + l->l_bs = highbit64(db->db_size) - 1; + l->l_dbuf = db; + + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); + zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); + +
rw_exit(&l->l_rwlock); + if (winner != NULL) { + /* someone else set it first */ + zap_leaf_evict_sync(&l->l_dbu); + l = winner; + } + + /* + * lhr_pad was previously used for the next leaf in the leaf + * chain. There should be no chained leafs (as we have removed + * support for them). + */ + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); + + /* + * There should be more hash entries than there can be + * chunks to put in the hash table + */ + ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); + + /* The chunks should begin at the end of the hash table */ + ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *) + &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); + + /* The chunks should end at the end of the block */ + ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - + (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); + + return (l); +} + +static int +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) +{ + dmu_buf_t *db; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + /* + * If system crashed just after dmu_free_long_range in zfs_rmnode, we + * would be left with an empty xattr dir in delete queue. blkid=0 + * would be passed in when doing zfs_purgedir. If that's the case we + * should just return immediately. The underlying objects should + * already be freed, so this should be perfectly fine. 
+ */ + if (blkid == 0) + return (SET_ERROR(ENOENT)); + + int bs = FZAP_BLOCK_SHIFT(zap); + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + + ASSERT3U(db->db_object, ==, zap->zap_object); + ASSERT3U(db->db_offset, ==, blkid << bs); + ASSERT3U(db->db_size, ==, 1 << bs); + ASSERT(blkid != 0); + + zap_leaf_t *l = dmu_buf_get_user(db); + + if (l == NULL) + l = zap_open_leaf(blkid, db); + + rw_enter(&l->l_rwlock, lt); + /* + * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, + * causing ASSERT below to fail. + */ + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + *lp = l; + return (0); +} + +static int +zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + ASSERT3U(idx, <, + (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); + return (0); + } else { + return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, + idx, valp)); + } +} + +static int +zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) +{ + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { + ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; + return (0); + } else { + return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, + idx, blk, tx)); + } +} + +static int +zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + int epb = bs >> 3; /* entries per block */ + int err = 0; + + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + /* + * Check for i/o errors + */ + for (int i = 0; i < nptrs; i += 
epb) { + uint64_t blk; + err = zap_idx_to_blk(zap, idx + i, &blk); + if (err != 0) { + return (err); + } + } + + for (int i = 0; i < nptrs; i++) { + err = zap_set_idx_to_blk(zap, idx + i, blk, tx); + ASSERT0(err); /* we checked for i/o errors above */ + if (err != 0) + break; + } + + return (err); +} + +#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) +#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) + +/* + * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl. + * If two leaves are siblings, their ranges are adjacent and contain the same + * number of entries. In order to find out if a leaf has a sibling, we need to + * check the range corresponding to the sibling leaf. There is no need to check + * all entries in the range, we only need to check the first and the last one. + */ +static uint64_t +check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; + uint64_t nptrs = (1ULL << pref_diff); /* 64-bit shift: diff may be >= 31 */ + uint64_t first; + uint64_t last; + + ASSERT3U(idx+nptrs, <=, (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + + if (zap_idx_to_blk(zap, idx, &first) != 0) + return (0); + + if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) + return (0); + + if (first != last) + return (0); + return (first); +} + +static int +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) +{ + uint64_t blk; + + ASSERT(zap->zap_dbuf == NULL || + zap_f_phys(zap) == zap->zap_dbuf->db_data); + + /* Reality check for corrupt zap objects (leaf or header).
*/ + if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && + zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || + zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { + return (SET_ERROR(EIO)); + } + + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + int err = zap_idx_to_blk(zap, idx, &blk); + if (err != 0) + return (err); + err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); + + ASSERT(err || + ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == + zap_leaf_phys(*lp)->l_hdr.lh_prefix); + return (err); +} + +static int +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, + const void *tag, dmu_tx_t *tx, zap_leaf_t **lp) +{ + zap_t *zap = zn->zn_zap; + uint64_t hash = zn->zn_hash; + int err; + int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + + ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + zap_leaf_phys(l)->l_hdr.lh_prefix); + + if (zap_tryupgradedir(zap, tx) == 0 || + old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { + /* We failed to upgrade, or need to grow the pointer table */ + objset_t *os = zap->zap_objset; + uint64_t object = zap->zap_object; + + zap_put_leaf(l); + *lp = l = NULL; + zap_unlockdir(zap, tag); + err = zap_lockdir(os, object, tx, RW_WRITER, + FALSE, FALSE, tag, &zn->zn_zap); + zap = zn->zn_zap; + if (err != 0) + return (err); + ASSERT(!zap->zap_ismicro); + + while (old_prefix_len == + zap_f_phys(zap)->zap_ptrtbl.zt_shift) { + err = zap_grow_ptrtbl(zap, tx); + if (err != 0) + return (err); + } + + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { + /* it split while our locks were down */ + *lp = l; + return (0); + } + } + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + 
zap_leaf_phys(l)->l_hdr.lh_prefix); + + int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + (old_prefix_len + 1); + uint64_t sibling = + (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; + + /* check for i/o errors before doing zap_leaf_split */ + for (int i = 0; i < (1ULL << prefix_diff); i++) { + uint64_t blk; + err = zap_idx_to_blk(zap, sibling + i, &blk); + if (err != 0) + return (err); + ASSERT3U(blk, ==, l->l_blkid); + } + + zap_leaf_t *nl = zap_create_leaf(zap, tx); + zap_leaf_split(l, nl, zap->zap_normflags != 0); + + /* set sibling pointers */ + for (int i = 0; i < (1ULL << prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); + ASSERT0(err); /* we checked for i/o errors above */ + } + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0); + + if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { + /* we want the sibling */ + zap_put_leaf(l); + *lp = nl; + } else { + zap_put_leaf(nl); + *lp = l; + } + + return (0); +} + +static void +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, + const void *tag, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && + zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); + + zap_put_leaf(l); + + if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { + /* + * We are in the middle of growing the pointer table, or + * this leaf will soon make us grow it. 
+ */ + if (zap_tryupgradedir(zap, tx) == 0) { + objset_t *os = zap->zap_objset; + uint64_t zapobj = zap->zap_object; + + zap_unlockdir(zap, tag); + int err = zap_lockdir(os, zapobj, tx, + RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); + zap = zn->zn_zap; + if (err != 0) + return; + } + + /* could have finished growing while our locks were down */ + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) + (void) zap_grow_ptrtbl(zap, tx); + } +} + +static int +fzap_checkname(zap_name_t *zn) +{ + uint32_t maxnamelen = zn->zn_normbuf_len; + uint64_t len = (uint64_t)zn->zn_key_orig_numints * zn->zn_key_intlen; + /* Only allow directory zap to have longname */ + if (len > maxnamelen || + (len > ZAP_MAXNAMELEN && + zn->zn_zap->zap_dnode->dn_type != DMU_OT_DIRECTORY_CONTENTS)) + return (SET_ERROR(ENAMETOOLONG)); + return (0); +} + +static int +fzap_checksize(uint64_t integer_size, uint64_t num_integers) +{ + /* Only integer sizes supported by C */ + switch (integer_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return (SET_ERROR(EINVAL)); + } + + if (integer_size * num_integers > ZAP_MAXVALUELEN) + return (SET_ERROR(E2BIG)); + + return (0); +} + +static int +fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) +{ + int err = fzap_checkname(zn); + if (err != 0) + return (err); + return (fzap_checksize(integer_size, num_integers)); +} + +/* + * Routines for manipulating attributes. 
+ */ +int +fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *ncp, + uint64_t *actual_num_integers) +{ + zap_leaf_t *l; + zap_entry_handle_t zeh; + + int err = fzap_checkname(zn); + if (err != 0) + return (err); + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + if ((err = fzap_checksize(integer_size, num_integers)) != 0) { + zap_put_leaf(l); + return (err); + } + + err = zap_entry_read(&zeh, integer_size, num_integers, buf); + if (err == 0 && actual_num_integers != NULL) + *actual_num_integers = zeh.zeh_num_integers; + (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); + if (ncp) { + *ncp = zap_entry_normalization_conflict(&zeh, + zn, NULL, zn->zn_zap); + } + } + + zap_put_leaf(l); + return (err); +} + +int +fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(!zap->zap_ismicro); + ASSERT0(fzap_check(zn, integer_size, num_integers)); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + err = SET_ERROR(EEXIST); + goto out; + } + if (err != ENOENT) + goto out; + + err = zap_entry_create(l, zn, cd, + integer_size, num_integers, val, &zeh); + + if (err == 0) { + zap_increment_num_entries(zap, 1, tx); + } else if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tag, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ + if (err == 0) + goto retry; + } + +out: + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } + return (err); +} + +int +fzap_add(zap_name_t *zn, + 
uint64_t integer_size, uint64_t num_integers, + const void *val, const void *tag, dmu_tx_t *tx) +{ + int err = fzap_check(zn, integer_size, num_integers); + if (err != 0) + return (err); + + return (fzap_add_cd(zn, integer_size, num_integers, + val, ZAP_NEED_CD, tag, tx)); +} + +int +fzap_update(zap_name_t *zn, + int integer_size, uint64_t num_integers, const void *val, + const void *tag, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + boolean_t create; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + err = fzap_check(zn, integer_size, num_integers); + if (err != 0) + return (err); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + create = (err == ENOENT); + ASSERT(err == 0 || err == ENOENT); + + if (create) { + err = zap_entry_create(l, zn, ZAP_NEED_CD, + integer_size, num_integers, val, &zeh); + if (err == 0) + zap_increment_num_entries(zap, 1, tx); + } else { + err = zap_entry_update(&zeh, integer_size, num_integers, val); + } + + if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tag, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ + if (err == 0) + goto retry; + } + + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } + return (err); +} + +int +fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + goto out; + + if (integer_size != NULL) + *integer_size = zeh.zeh_integer_size; + if (num_integers != NULL) + *num_integers = zeh.zeh_num_integers; +out: + zap_put_leaf(l); + return (err); +} + +int +fzap_remove(zap_name_t *zn, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err 
= zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + zap_entry_remove(&zeh); + zap_increment_num_entries(zn->zn_zap, -1, tx); + + if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && + zap_shrink_enabled) + return (zap_shrink(zn, l, tx)); + } + zap_put_leaf(l); + return (err); +} + +void +fzap_prefetch(zap_name_t *zn) +{ + uint64_t blk; + zap_t *zap = zn->zn_zap; + + uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, + zap_f_phys(zap)->zap_ptrtbl.zt_shift); + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + int bs = FZAP_BLOCK_SHIFT(zap); + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, + ZIO_PRIORITY_SYNC_READ); +} + +/* + * Routines for iterating over the attributes. + */ + +int +fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) +{ + int err; + zap_entry_handle_t zeh; + zap_leaf_t *l; + + /* retrieve the next entry at or after zc_hash/zc_cd */ + /* if no entry, return ENOENT */ + + /* + * If we are reading from the beginning, we're almost certain to + * iterate over the entire ZAP object. If there are multiple leaf + * blocks (freeblk > 2), prefetch the whole object (up to + * dmu_prefetch_max bytes), so that we read the leaf blocks + * concurrently. (Unless noprefetch was requested via + * zap_cursor_init_noprefetch()). + */ + if (zc->zc_hash == 0 && zap_iterate_prefetch && + zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, + zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), + ZIO_PRIORITY_ASYNC_READ); + } + + if (zc->zc_leaf) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + + /* + * The leaf was either shrunk or split. 
+ */ + if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || + (ZAP_HASH_IDX(zc->zc_hash, + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + } + +again: + if (zc->zc_leaf == NULL) { + err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, + &zc->zc_leaf); + if (err != 0) + return (err); + } + l = zc->zc_leaf; + + err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); + + if (err == ENOENT) { + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) { + zc->zc_hash = -1ULL; + zc->zc_cd = 0; + } else { + uint64_t nocare = (1ULL << + (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; + + zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; + zc->zc_cd = 0; + + if (zc->zc_hash == 0) { + zc->zc_hash = -1ULL; + } else { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + goto again; + } + } + } + + if (err == 0) { + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + za->za_integer_length = zeh.zeh_integer_size; + za->za_num_integers = zeh.zeh_num_integers; + if (zeh.zeh_num_integers == 0) { + za->za_first_integer = 0; + } else { + err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); + ASSERT(err == 0 || err == EOVERFLOW); + } + err = zap_entry_read_name(zap, &zeh, + za->za_name_len, za->za_name); + ASSERT0(err); + + za->za_normalization_conflict = + zap_entry_normalization_conflict(&zeh, + NULL, za->za_name, zap); + } + rw_exit(&zc->zc_leaf->l_rwlock); + return (err); +} + +static void +zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +{ + uint64_t lastblk = 0; + + /* + * NB: if a leaf has more pointers than an entire ptrtbl block + * can hold, then it'll be accounted for more than once, since + * we won't have lastblk. 
+ */ + for (int i = 0; i < len; i++) { + zap_leaf_t *l; + + if (tbl[i] == lastblk) + continue; + lastblk = tbl[i]; + + int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + if (err == 0) { + zap_leaf_stats(zap, l, zs); + zap_put_leaf(l); + } + } +} + +void +fzap_get_stats(zap_t *zap, zap_stats_t *zs) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + zs->zs_blocksize = 1ULL << bs; + + /* + * Set zap_phys_t fields + */ + zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; + zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; + zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; + zs->zs_block_type = zap_f_phys(zap)->zap_block_type; + zs->zs_magic = zap_f_phys(zap)->zap_magic; + zs->zs_salt = zap_f_phys(zap)->zap_salt; + + /* + * Set zap_ptrtbl fields + */ + zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; + zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; + zs->zs_ptrtbl_blks_copied = + zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; + zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; + zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + /* the ptrtbl is entirely in the header block. */ + zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); + } else { + dmu_prefetch_by_dnode(zap->zap_dnode, 0, + zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); + + for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + b++) { + dmu_buf_t *db; + int err; + + err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (err == 0) { + zap_stats_ptrtbl(zap, db->db_data, + 1<<(bs-3), zs); + dmu_buf_rele(db, FTAG); + } + } + } +} + +/* + * Find last allocated block and update freeblk. 
+ */ +static void +zap_trunc(zap_t *zap) +{ + uint64_t nentries; + uint64_t lastblk; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { + /* External ptrtbl */ + nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); + lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + + zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; + } else { + /* Embedded ptrtbl */ + nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + lastblk = 0; + } + + for (uint64_t idx = 0; idx < nentries; idx++) { + uint64_t blk; + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + if (blk > lastblk) + lastblk = blk; + } + + ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); + + zap_f_phys(zap)->zap_freeblk = lastblk + 1; +} + +/* + * ZAP shrinking algorithm. + * + * We shrink ZAP recursively removing empty leaves. We can remove an empty leaf + * only if it has a sibling. Sibling leaves have the same prefix length and + * their prefixes differ only by the least significant (sibling) bit. We require + * both siblings to be empty. This eliminates a need to rehash the non-empty + * remaining leaf. When we have removed one of two empty siblings, we set ptrtbl + * entries of the removed leaf to point out to the remaining leaf. Prefix length + * of the remaining leaf is decremented. As a result, it has a new prefix and it + * might have a new sibling. So, we repeat the process. + * + * Steps: + * 1. Check if a sibling leaf (sl) exists and it is empty. + * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. + * 3. Release the sibling (sl) to dereference it again with WRITER lock. + * 4. Upgrade zapdir lock to WRITER (once). + * 5. Dereference released leaves again. + * 6. If it is needed, recheck whether both leaves are still siblings and empty. + * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of + * the remaining leaf (slbit 0). + * 8. Free disk block of the removed leaf (dmu_free_range). + * 9.
Decrement prefix_len of the remaining leaf. + * 10. Repeat the steps. + */ +static int +zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + uint64_t hash = zn->zn_hash; + uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + boolean_t trunc = B_FALSE; + int err = 0; + + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); + ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); + + boolean_t writer = B_FALSE; + + /* + * To avoid deadlock always deref leaves in the same order - + * sibling 0 first, then sibling 1. + */ + while (prefix_len) { + zap_leaf_t *sl; + int64_t prefix_diff = zt_shift - prefix_len; + uint64_t sl_prefix = prefix ^ 1; + uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); + int slbit = prefix & 1; + + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); + + /* + * Check if there is a sibling by reading ptrtbl ptrs. + */ + if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) + break; + + /* + * sibling 1, unlock it - we haven't yet dereferenced sibling 0. + */ + if (slbit == 1) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Dereference sibling leaf and check if it is empty. + */ + if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, + &sl)) != 0) + break; + + ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); + + /* + * Check if we have a sibling and it is empty. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || + zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { + zap_put_leaf(sl); + break; + } + + zap_put_leaf(sl); + + /* + * If there are two empty siblings, we have work to do, so + * we need to lock ZAP ptrtbl as WRITER.
+ */ + if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { + /* We failed to upgrade */ + if (l != NULL) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Usually, the right way to upgrade from a READER lock + * to a WRITER lock is to call zap_unlockdir() and + * zap_lockdir(), but we do not have a tag. Instead, + * we do it in a more sophisticated way. + */ + rw_exit(&zap->zap_rwlock); + rw_enter(&zap->zap_rwlock, RW_WRITER); + dmu_buf_will_dirty(zap->zap_dbuf, tx); + + zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + writer = B_TRUE; + } + + /* + * Here we have WRITER lock for ptrtbl. + * Now, we need a WRITER lock for both sibling leaves. + * Also, we have to recheck if the leaves are still siblings + * and still empty. + */ + if (l == NULL) { + /* sibling 0 */ + if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash), + tx, RW_WRITER, &l)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) + break; + } + + /* sibling 1 */ + if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx, + RW_WRITER, &sl)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { + zap_put_leaf(sl); + break; + } + + /* If we have gotten here, we have a leaf to collapse */ + uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff; + uint64_t nptrs = (1ULL << prefix_diff); + uint64_t sl_blkid = sl->l_blkid; + + /* + * Set ptrtbl entries to point out to the sibling 0 blkid + */ + if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, + tx)) != 0) { + zap_put_leaf(sl); + break; + } + + /* + * Free sibling 1 disk block.
+ */ + int bs = FZAP_BLOCK_SHIFT(zap); + if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) + trunc = B_TRUE; + + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + sl_blkid << bs, 1 << bs, tx); + zap_put_leaf(sl); + + zap_f_phys(zap)->zap_num_leafs--; + + /* + * Update prefix and prefix_len. + */ + zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len--; + + prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + } + + if (trunc) + zap_trunc(zap); + + if (l != NULL) + zap_put_leaf(l); + + return (err); +} + +ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, + "When iterating ZAP object, prefetch it"); + +ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, + "Enable ZAP shrinking"); diff --git a/module/zfs/zap_impl.c b/module/zfs/zap_impl.c new file mode 100644 index 00000000000..8788480318f --- /dev/null +++ b/module/zfs/zap_impl.c @@ -0,0 +1,527 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. + */ + +#include +#include +#include +#include +#include +#include + +/* + * This routine "consumes" the caller's hold on the dbuf, which must + * have the specified tag. + */ +int +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) +{ + ASSERT0(db->db_offset); + objset_t *os = dmu_buf_get_objset(db); + uint64_t obj = db->db_object; + + *zapp = NULL; + + if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP) + return (SET_ERROR(EINVAL)); + + zap_t *zap = dmu_buf_get_user(db); + if (zap == NULL) { + zap = mzap_open(db); + if (zap == NULL) { + /* + * mzap_open() didn't like what it saw on-disk. + * Check for corruption! + */ + return (SET_ERROR(EIO)); + } + } + + /* + * We're checking zap_ismicro without the lock held, in order to + * tell what type of lock we want. Once we have some sort of + * lock, see if it really is the right type. In practice this + * can only be different if it was upgraded from micro to fat, + * and micro wanted WRITER but fat only needs READER. + */ + krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; + rw_enter(&zap->zap_rwlock, lt); + if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { + /* it was upgraded, now we only need reader */ + ASSERT(lt == RW_WRITER); + ASSERT(RW_READER == + ((!zap->zap_ismicro && fatreader) ? 
RW_READER : lti)); + rw_downgrade(&zap->zap_rwlock); + lt = RW_READER; + } + + zap->zap_objset = os; + zap->zap_dnode = dn; + + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + + ASSERT3P(zap->zap_dbuf, ==, db); + + ASSERT(!zap->zap_ismicro || + zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); + if (zap->zap_ismicro && tx && adding && + zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { + uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; + if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) { + dprintf("upgrading obj %llu: num_entries=%u\n", + (u_longlong_t)obj, zap->zap_m.zap_num_entries); + *zapp = zap; + int err = mzap_upgrade(zapp, tag, tx, 0); + if (err != 0) + rw_exit(&zap->zap_rwlock); + return (err); + } + VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); + zap->zap_m.zap_num_chunks = + db->db_size / MZAP_ENT_LEN - 1; + + if (newsz > SPA_OLD_MAXBLOCKSIZE) { + dsl_dataset_t *ds = dmu_objset_ds(os); + if (!dsl_dataset_feature_is_active(ds, + SPA_FEATURE_LARGE_MICROZAP)) { + /* + * A microzap just grew beyond the old limit + * for the first time, so we have to ensure the + * feature flag is activated. + * zap_get_micro_max_size() won't let us get + * here if the feature is not enabled, so we + * don't need any other checks beforehand. + * + * Since we're in open context, we can't + * activate the feature directly, so we instead + * flag it on the dataset for next sync. 
+ */ + dsl_dataset_dirty(ds, tx); + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation + [SPA_FEATURE_LARGE_MICROZAP] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + } + } + + *zapp = zap; + return (0); +} + +int +zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp) +{ + dmu_buf_t *db; + int err; + + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) + dmu_buf_rele(db, tag); + else + VERIFY(dnode_add_ref(dn, tag)); + return (err); +} + +int +zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp) +{ + dnode_t *dn; + dmu_buf_t *db; + int err; + + err = dnode_hold(os, obj, tag, &dn); + if (err != 0) + return (err); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); + } + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { + dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } + return (err); +} + +void +zap_unlockdir(zap_t *zap, const void *tag) +{ + rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); + dmu_buf_rele(zap->zap_dbuf, tag); +} + +static kmem_cache_t *zap_name_cache; +static kmem_cache_t *zap_attr_cache; +static kmem_cache_t *zap_name_long_cache; +static kmem_cache_t *zap_attr_long_cache; + +void +zap_init(void) +{ + zap_name_cache = kmem_cache_create("zap_name", + sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL, + NULL, NULL, NULL, 0); + + zap_attr_cache = kmem_cache_create("zap_attr_cache", + sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL, + NULL, NULL, NULL, NULL, 0); + + zap_name_long_cache = kmem_cache_create("zap_name_long", + sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL, + NULL, NULL, NULL, 0); + + 
zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache", + sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, + NULL, NULL, NULL, NULL, 0); +} + +void +zap_fini(void) +{ + kmem_cache_destroy(zap_name_cache); + kmem_cache_destroy(zap_attr_cache); + kmem_cache_destroy(zap_name_long_cache); + kmem_cache_destroy(zap_attr_long_cache); +} + +zap_name_t * +zap_name_alloc(zap_t *zap, boolean_t longname) +{ + kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache; + zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP); + + zn->zn_zap = zap; + zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; + return (zn); +} + +void +zap_name_free(zap_name_t *zn) +{ + if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) { + kmem_cache_free(zap_name_cache, zn); + } else { + ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW); + kmem_cache_free(zap_name_long_cache, zn); + } +} + +int +zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) +{ + zap_t *zap = zn->zn_zap; + size_t key_len = strlen(key) + 1; + + /* Make sure zn is allocated for longname if key is long */ + IMPLY(key_len > ZAP_MAXNAMELEN, + zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW); + + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = key; + zn->zn_key_orig_numints = key_len; + zn->zn_matchtype = mt; + zn->zn_normflags = zap->zap_normflags; + + /* + * If we're dealing with a case sensitive lookup on a mixed or + * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup + * will fold case to all caps overriding the lookup request. + */ + if (mt & MT_MATCH_CASE) + zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; + + if (zap->zap_normflags) { + /* + * We *must* use zap_normflags because this normalization is + * what the hash is computed from. 
+ */ + if (zap_normalize(zap, key, zn->zn_normbuf, + zap->zap_normflags, zn->zn_normbuf_len) != 0) + return (SET_ERROR(ENOTSUP)); + zn->zn_key_norm = zn->zn_normbuf; + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; + } else { + if (mt != 0) + return (SET_ERROR(ENOTSUP)); + zn->zn_key_norm = zn->zn_key_orig; + zn->zn_key_norm_numints = zn->zn_key_orig_numints; + } + + zn->zn_hash = zap_hash(zn); + + if (zap->zap_normflags != zn->zn_normflags) { + /* + * We *must* use zn_normflags because this normalization is + * what the matching is based on. (Not the hash!) + */ + if (zap_normalize(zap, key, zn->zn_normbuf, + zn->zn_normflags, zn->zn_normbuf_len) != 0) + return (SET_ERROR(ENOTSUP)); + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; + } + + return (0); +} + +zap_name_t * +zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) +{ + size_t key_len = strlen(key) + 1; + zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN)); + if (zap_name_init_str(zn, key, mt) != 0) { + zap_name_free(zn); + return (NULL); + } + return (zn); +} + +zap_name_t * +zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) +{ + zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP); + + ASSERT0(zap->zap_normflags); + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = zn->zn_key_norm = key; + zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; + zn->zn_matchtype = 0; + zn->zn_normbuf_len = ZAP_MAXNAMELEN; + + zn->zn_hash = zap_hash(zn); + return (zn); +} + +uint64_t +zap_getflags(zap_t *zap) +{ + if (zap->zap_ismicro) + return (0); + return (zap_f_phys(zap)->zap_flags); +} + +int +zap_hashbits(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return (48); + else + return (28); +} + +uint32_t +zap_maxcd(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return ((1<<16)-1); + else + return (-1U); +} + +uint64_t +zap_hash(zap_name_t *zn) +{ + zap_t *zap = zn->zn_zap; + uint64_t h = 0; + + if 
(zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { + ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); + h = *(uint64_t *)zn->zn_key_orig; + } else { + h = zap->zap_salt; + ASSERT(h != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + const uint64_t *wp = zn->zn_key_norm; + + ASSERT(zn->zn_key_intlen == 8); + for (int i = 0; i < zn->zn_key_norm_numints; + wp++, i++) { + uint64_t word = *wp; + + for (int j = 0; j < 8; j++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ word) & 0xFF]; + word >>= NBBY; + } + } + } else { + const uint8_t *cp = zn->zn_key_norm; + + /* + * We previously stored the terminating null on + * disk, but didn't hash it, so we need to + * continue to not hash it. (The + * zn_key_*_numints includes the terminating + * null for non-binary keys.) + */ + int len = zn->zn_key_norm_numints - 1; + + ASSERT(zn->zn_key_intlen == 1); + for (int i = 0; i < len; cp++, i++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ *cp) & 0xFF]; + } + } + } + /* + * Don't use all 64 bits, since we need some in the cookie for + * the collision differentiator. We MUST use the high bits, + * since those are the ones that we first pay attention to when + * choosing the bucket. 
+ */ + h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); + + return (h); +} + +int +zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags, + size_t outlen) +{ + ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); + + size_t inlen = strlen(name) + 1; + + int err = 0; + (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, + normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, + U8_UNICODE_LATEST, &err); + + return (err); +} + +boolean_t +zap_match(zap_name_t *zn, const char *matchname) +{ + boolean_t res = B_FALSE; + ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); + + if (zn->zn_matchtype & MT_NORMALIZE) { + size_t namelen = zn->zn_normbuf_len; + char normbuf[ZAP_MAXNAMELEN]; + char *norm = normbuf; + + /* + * Cannot allocate this on-stack as it exceeds the stack-limit + * of 1024. + */ + if (namelen > ZAP_MAXNAMELEN) + norm = kmem_alloc(namelen, KM_SLEEP); + + if (zap_normalize(zn->zn_zap, matchname, norm, + zn->zn_normflags, namelen) != 0) { + res = B_FALSE; + } else { + res = (strcmp(zn->zn_key_norm, norm) == 0); + } + if (norm != normbuf) + kmem_free(norm, namelen); + } else { + res = (strcmp(zn->zn_key_orig, matchname) == 0); + } + return (res); +} + +void +zap_byteswap(void *buf, size_t size) +{ + uint64_t block_type = *(uint64_t *)buf; + + if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { + /* ASSERT(magic == ZAP_LEAF_MAGIC); */ + mzap_byteswap(buf, size); + } else { + fzap_byteswap(buf, size); + } +} + +void +zap_evict_sync(void *dbu) +{ + zap_t *zap = dbu; + + rw_destroy(&zap->zap_rwlock); + + if (zap->zap_ismicro) + mze_destroy(zap); + else + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + + kmem_free(zap, sizeof (zap_t)); +} + +static zap_attribute_t * +zap_attribute_alloc_impl(boolean_t longname) +{ + zap_attribute_t *za; + + za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache, + KM_SLEEP); + za->za_name_len = (longname)?
ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; + return (za); +} + +zap_attribute_t * +zap_attribute_alloc(void) +{ + return (zap_attribute_alloc_impl(B_FALSE)); +} + +zap_attribute_t * +zap_attribute_long_alloc(void) +{ + return (zap_attribute_alloc_impl(B_TRUE)); +} + +void +zap_attribute_free(zap_attribute_t *za) +{ + if (za->za_name_len == ZAP_MAXNAMELEN) { + kmem_cache_free(zap_attr_cache, za); + } else { + ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW); + kmem_cache_free(zap_attr_long_cache, za); + } +} diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 4e343ebf5d1..b094b113971 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -81,284 +81,7 @@ zap_get_micro_max_size(spa_t *spa) return (SPA_OLD_MAXBLOCKSIZE); } -static int mzap_upgrade(zap_t **zapp, - const void *tag, dmu_tx_t *tx, zap_flags_t flags); - -uint64_t -zap_getflags(zap_t *zap) -{ - if (zap->zap_ismicro) - return (0); - return (zap_f_phys(zap)->zap_flags); -} - -int -zap_hashbits(zap_t *zap) -{ - if (zap_getflags(zap) & ZAP_FLAG_HASH64) - return (48); - else - return (28); -} - -uint32_t -zap_maxcd(zap_t *zap) -{ - if (zap_getflags(zap) & ZAP_FLAG_HASH64) - return ((1<<16)-1); - else - return (-1U); -} - -static uint64_t -zap_hash(zap_name_t *zn) -{ - zap_t *zap = zn->zn_zap; - uint64_t h = 0; - - if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { - ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); - h = *(uint64_t *)zn->zn_key_orig; - } else { - h = zap->zap_salt; - ASSERT(h != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - - if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { - const uint64_t *wp = zn->zn_key_norm; - - ASSERT(zn->zn_key_intlen == 8); - for (int i = 0; i < zn->zn_key_norm_numints; - wp++, i++) { - uint64_t word = *wp; - - for (int j = 0; j < 8; j++) { - h = (h >> 8) ^ - zfs_crc64_table[(h ^ word) & 0xFF]; - word >>= NBBY; - } - } - } else { - const uint8_t *cp = zn->zn_key_norm; - - /* - * We previously stored the terminating null on - * disk, but 
didn't hash it, so we need to - * continue to not hash it. (The - * zn_key_*_numints includes the terminating - * null for non-binary keys.) - */ - int len = zn->zn_key_norm_numints - 1; - - ASSERT(zn->zn_key_intlen == 1); - for (int i = 0; i < len; cp++, i++) { - h = (h >> 8) ^ - zfs_crc64_table[(h ^ *cp) & 0xFF]; - } - } - } - /* - * Don't use all 64 bits, since we need some in the cookie for - * the collision differentiator. We MUST use the high bits, - * since those are the ones that we first pay attention to when - * choosing the bucket. - */ - h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); - - return (h); -} - -static int -zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags, - size_t outlen) -{ - ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); - - size_t inlen = strlen(name) + 1; - - int err = 0; - (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, - normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, - U8_UNICODE_LATEST, &err); - - return (err); -} - -boolean_t -zap_match(zap_name_t *zn, const char *matchname) -{ - boolean_t res = B_FALSE; - ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); - - if (zn->zn_matchtype & MT_NORMALIZE) { - size_t namelen = zn->zn_normbuf_len; - char normbuf[ZAP_MAXNAMELEN]; - char *norm = normbuf; - - /* - * Cannot allocate this on-stack as it exceed the stack-limit of - * 1024. 
- */ - if (namelen > ZAP_MAXNAMELEN) - norm = kmem_alloc(namelen, KM_SLEEP); - - if (zap_normalize(zn->zn_zap, matchname, norm, - zn->zn_normflags, namelen) != 0) { - res = B_FALSE; - } else { - res = (strcmp(zn->zn_key_norm, norm) == 0); - } - if (norm != normbuf) - kmem_free(norm, namelen); - } else { - res = (strcmp(zn->zn_key_orig, matchname) == 0); - } - return (res); -} - -static kmem_cache_t *zap_name_cache; -static kmem_cache_t *zap_attr_cache; -static kmem_cache_t *zap_name_long_cache; -static kmem_cache_t *zap_attr_long_cache; - void -zap_init(void) -{ - zap_name_cache = kmem_cache_create("zap_name", - sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL, - NULL, NULL, NULL, 0); - - zap_attr_cache = kmem_cache_create("zap_attr_cache", - sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL, - NULL, NULL, NULL, NULL, 0); - - zap_name_long_cache = kmem_cache_create("zap_name_long", - sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL, - NULL, NULL, NULL, 0); - - zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache", - sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, - NULL, NULL, NULL, NULL, 0); -} - -void -zap_fini(void) -{ - kmem_cache_destroy(zap_name_cache); - kmem_cache_destroy(zap_attr_cache); - kmem_cache_destroy(zap_name_long_cache); - kmem_cache_destroy(zap_attr_long_cache); -} - -static zap_name_t * -zap_name_alloc(zap_t *zap, boolean_t longname) -{ - kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache; - zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP); - - zn->zn_zap = zap; - zn->zn_normbuf_len = longname ? 
ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; - return (zn); -} - -void -zap_name_free(zap_name_t *zn) -{ - if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) { - kmem_cache_free(zap_name_cache, zn); - } else { - ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW); - kmem_cache_free(zap_name_long_cache, zn); - } -} - -static int -zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) -{ - zap_t *zap = zn->zn_zap; - size_t key_len = strlen(key) + 1; - - /* Make sure zn is allocated for longname if key is long */ - IMPLY(key_len > ZAP_MAXNAMELEN, - zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW); - - zn->zn_key_intlen = sizeof (*key); - zn->zn_key_orig = key; - zn->zn_key_orig_numints = key_len; - zn->zn_matchtype = mt; - zn->zn_normflags = zap->zap_normflags; - - /* - * If we're dealing with a case sensitive lookup on a mixed or - * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup - * will fold case to all caps overriding the lookup request. - */ - if (mt & MT_MATCH_CASE) - zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; - - if (zap->zap_normflags) { - /* - * We *must* use zap_normflags because this normalization is - * what the hash is computed from. - */ - if (zap_normalize(zap, key, zn->zn_normbuf, - zap->zap_normflags, zn->zn_normbuf_len) != 0) - return (SET_ERROR(ENOTSUP)); - zn->zn_key_norm = zn->zn_normbuf; - zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; - } else { - if (mt != 0) - return (SET_ERROR(ENOTSUP)); - zn->zn_key_norm = zn->zn_key_orig; - zn->zn_key_norm_numints = zn->zn_key_orig_numints; - } - - zn->zn_hash = zap_hash(zn); - - if (zap->zap_normflags != zn->zn_normflags) { - /* - * We *must* use zn_normflags because this normalization is - * what the matching is based on. (Not the hash!) 
- */ - if (zap_normalize(zap, key, zn->zn_normbuf, - zn->zn_normflags, zn->zn_normbuf_len) != 0) - return (SET_ERROR(ENOTSUP)); - zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; - } - - return (0); -} - -zap_name_t * -zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) -{ - size_t key_len = strlen(key) + 1; - zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN)); - if (zap_name_init_str(zn, key, mt) != 0) { - zap_name_free(zn); - return (NULL); - } - return (zn); -} - -static zap_name_t * -zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) -{ - zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP); - - ASSERT0(zap->zap_normflags); - zn->zn_zap = zap; - zn->zn_key_intlen = sizeof (*key); - zn->zn_key_orig = zn->zn_key_norm = key; - zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; - zn->zn_matchtype = 0; - zn->zn_normbuf_len = ZAP_MAXNAMELEN; - - zn->zn_hash = zap_hash(zn); - return (zn); -} - -static void mzap_byteswap(mzap_phys_t *buf, size_t size) { buf->mz_block_type = BSWAP_64(buf->mz_block_type); @@ -373,19 +96,6 @@ mzap_byteswap(mzap_phys_t *buf, size_t size) } } -void -zap_byteswap(void *buf, size_t size) -{ - uint64_t block_type = *(uint64_t *)buf; - - if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { - /* ASSERT(magic == ZAP_LEAF_MAGIC); */ - mzap_byteswap(buf, size); - } else { - fzap_byteswap(buf, size); - } -} - __attribute__((always_inline)) inline static int mze_compare(const void *arg1, const void *arg2) @@ -417,7 +127,7 @@ mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) zfs_btree_add(&zap->zap_m.zap_tree, &mze); } -static mzap_ent_t * +mzap_ent_t * mze_find(zap_name_t *zn, zfs_btree_index_t *idx) { mzap_ent_t mze_tofind; @@ -482,7 +192,7 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) * Check if the current entry keeps the colliding entries under the fatzap leaf * size. 
*/ -static boolean_t +boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) { zap_t *zap = zn->zn_zap; @@ -508,14 +218,14 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); } -static void +void mze_destroy(zap_t *zap) { zfs_btree_clear(&zap->zap_m.zap_tree); zfs_btree_destroy(&zap->zap_m.zap_tree); } -static zap_t * +zap_t * mzap_open(dmu_buf_t *db) { zap_t *winner; @@ -614,162 +324,7 @@ mzap_open(dmu_buf_t *db) return (winner); } -/* - * This routine "consumes" the caller's hold on the dbuf, which must - * have the specified tag. - */ -static int -zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) -{ - ASSERT0(db->db_offset); - objset_t *os = dmu_buf_get_objset(db); - uint64_t obj = db->db_object; - - *zapp = NULL; - - if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP) - return (SET_ERROR(EINVAL)); - - zap_t *zap = dmu_buf_get_user(db); - if (zap == NULL) { - zap = mzap_open(db); - if (zap == NULL) { - /* - * mzap_open() didn't like what it saw on-disk. - * Check for corruption! - */ - return (SET_ERROR(EIO)); - } - } - - /* - * We're checking zap_ismicro without the lock held, in order to - * tell what type of lock we want. Once we have some sort of - * lock, see if it really is the right type. In practice this - * can only be different if it was upgraded from micro to fat, - * and micro wanted WRITER but fat only needs READER. - */ - krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; - rw_enter(&zap->zap_rwlock, lt); - if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { - /* it was upgraded, now we only need reader */ - ASSERT(lt == RW_WRITER); - ASSERT(RW_READER == - ((!zap->zap_ismicro && fatreader) ? 
RW_READER : lti)); - rw_downgrade(&zap->zap_rwlock); - lt = RW_READER; - } - - zap->zap_objset = os; - zap->zap_dnode = dn; - - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - - ASSERT3P(zap->zap_dbuf, ==, db); - - ASSERT(!zap->zap_ismicro || - zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); - if (zap->zap_ismicro && tx && adding && - zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { - uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) { - dprintf("upgrading obj %llu: num_entries=%u\n", - (u_longlong_t)obj, zap->zap_m.zap_num_entries); - *zapp = zap; - int err = mzap_upgrade(zapp, tag, tx, 0); - if (err != 0) - rw_exit(&zap->zap_rwlock); - return (err); - } - VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); - zap->zap_m.zap_num_chunks = - db->db_size / MZAP_ENT_LEN - 1; - - if (newsz > SPA_OLD_MAXBLOCKSIZE) { - dsl_dataset_t *ds = dmu_objset_ds(os); - if (!dsl_dataset_feature_is_active(ds, - SPA_FEATURE_LARGE_MICROZAP)) { - /* - * A microzap just grew beyond the old limit - * for the first time, so we have to ensure the - * feature flag is activated. - * zap_get_micro_max_size() won't let us get - * here if the feature is not enabled, so we - * don't need any other checks beforehand. - * - * Since we're in open context, we can't - * activate the feature directly, so we instead - * flag it on the dataset for next sync. 
- */ - dsl_dataset_dirty(ds, tx); - mutex_enter(&ds->ds_lock); - ds->ds_feature_activation - [SPA_FEATURE_LARGE_MICROZAP] = - (void *)B_TRUE; - mutex_exit(&ds->ds_lock); - } - } - } - - *zapp = zap; - return (0); -} - -static int -zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, - zap_t **zapp) -{ - dmu_buf_t *db; - int err; - - err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) - dmu_buf_rele(db, tag); - else - VERIFY(dnode_add_ref(dn, tag)); - return (err); -} - int -zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, - zap_t **zapp) -{ - dnode_t *dn; - dmu_buf_t *db; - int err; - - err = dnode_hold(os, obj, tag, &dn); - if (err != 0) - return (err); - err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { - dnode_rele(dn, tag); - return (err); - } - err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { - dmu_buf_rele(db, tag); - dnode_rele(dn, tag); - } - return (err); -} - -void -zap_unlockdir(zap_t *zap, const void *tag) -{ - rw_exit(&zap->zap_rwlock); - dnode_rele(zap->zap_dnode, tag); - dmu_buf_rele(zap->zap_dbuf, tag); -} - -static int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) { int err = 0; @@ -861,217 +416,11 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) } } -static uint64_t -zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) -{ - uint64_t obj; - - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - - if (allocated_dnode == NULL) { - dnode_t *dn; - 
obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, - indirect_blockshift, bonustype, bonuslen, dnodesize, - &dn, FTAG, tx); - mzap_create_impl(dn, normflags, flags, tx); - dnode_rele(dn, FTAG); - } else { - obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, - indirect_blockshift, bonustype, bonuslen, dnodesize, - allocated_dnode, tag, tx); - mzap_create_impl(*allocated_dnode, normflags, flags, tx); - } - - return (obj); -} - -int -zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, - 0, tx)); -} - -int -zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_claim_norm_dnsize(os, obj, - 0, ot, bonustype, bonuslen, dnodesize, tx)); -} - -int -zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, - dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, - bonuslen, 0, tx)); -} - -int -zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, - dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx) -{ - dnode_t *dn; - int error; - - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, - dnodesize, tx); - if (error != 0) - return (error); - - error = dnode_hold(os, obj, FTAG, &dn); - if (error != 0) - return (error); - - mzap_create_impl(dn, normflags, 0, tx); - - dnode_rele(dn, FTAG); - - return (0); -} - -uint64_t -zap_create(objset_t *os, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); -} - -uint64_t -zap_create_dnsize(objset_t *os, dmu_object_type_t ot, - 
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, - dnodesize, tx)); -} - -uint64_t -zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, - 0, tx)); -} - -uint64_t -zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_impl(os, normflags, 0, ot, 0, 0, - bonustype, bonuslen, dnodesize, NULL, NULL, tx)); -} - -uint64_t -zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_flags_dnsize(os, normflags, flags, ot, - leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); -} - -uint64_t -zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, - indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, - tx)); -} - -/* - * Create a zap object and return a pointer to the newly allocated dnode via - * the allocated_dnode argument. The returned dnode will be held and the - * caller is responsible for releasing the hold by calling dnode_rele(). 
- */ -uint64_t -zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) -{ - return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, - indirect_blockshift, bonustype, bonuslen, dnodesize, - allocated_dnode, tag, tx)); -} - -int -zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) -{ - /* - * dmu_object_free will free the object number and free the - * data. Freeing the data will cause our pageout function to be - * called, which will destroy our data (zap_leaf_t's and zap_t). - */ - - return (dmu_object_free(os, zapobj, tx)); -} - -void -zap_evict_sync(void *dbu) -{ - zap_t *zap = dbu; - - rw_destroy(&zap->zap_rwlock); - - if (zap->zap_ismicro) - mze_destroy(zap); - else - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - - kmem_free(zap, sizeof (zap_t)); -} - -int -zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - if (!zap->zap_ismicro) { - err = fzap_count(zap, count); - } else { - *count = zap->zap_m.zap_num_entries; - } - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_count_by_dnode(dnode_t *dn, uint64_t *count) -{ - zap_t *zap; - - int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - if (!zap->zap_ismicro) { - err = fzap_count(zap, count); - } else { - *count = zap->zap_m.zap_num_entries; - } - zap_unlockdir(zap, FTAG); - return (err); -} - /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). 
*/ -static boolean_t +boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, zfs_btree_index_t *idx) { @@ -1119,340 +468,7 @@ mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, return (B_FALSE); } -/* - * Routines for manipulating attributes. - */ - -int -zap_lookup(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - return (zap_lookup_norm(os, zapobj, name, integer_size, - num_integers, buf, 0, NULL, 0, NULL)); -} - -static int -zap_lookup_impl(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - int err = 0; - - zap_name_t *zn = zap_name_alloc_str(zap, name, mt); - if (zn == NULL) - return (SET_ERROR(ENOTSUP)); - - if (!zap->zap_ismicro) { - err = fzap_lookup(zn, integer_size, num_integers, buf, - realname, rn_len, ncp, NULL); - } else { - zfs_btree_index_t idx; - mzap_ent_t *mze = mze_find(zn, &idx); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - if (num_integers < 1) { - err = SET_ERROR(EOVERFLOW); - } else if (integer_size != 8) { - err = SET_ERROR(EINVAL); - } else { - *(uint64_t *)buf = - MZE_PHYS(zap, mze)->mze_value; - if (realname != NULL) - (void) strlcpy(realname, - MZE_PHYS(zap, mze)->mze_name, - rn_len); - if (ncp) { - *ncp = mzap_normalization_conflict(zap, - zn, mze, &idx); - } - } - } - } - zap_name_free(zn); - return (err); -} - -int -zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_impl(zap, name, integer_size, - num_integers, buf, mt, realname, rn_len, ncp); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_prefetch(objset_t 
*os, uint64_t zapobj, const char *name) -{ - zap_t *zap; - int err; - zap_name_t *zn; - - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - zn = zap_name_alloc_str(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - fzap_prefetch(zn); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_prefetch_object(objset_t *os, uint64_t zapobj) -{ - int error; - dmu_object_info_t doi; - - error = dmu_object_info(os, zapobj, &doi); - if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) - error = SET_ERROR(EINVAL); - if (error == 0) - dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset); - - return (error); -} - -int -zap_lookup_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - return (zap_lookup_norm_by_dnode(dn, name, integer_size, - num_integers, buf, 0, NULL, 0, NULL)); -} - -int -zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - zap_t *zap; - - int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_impl(zap, name, integer_size, - num_integers, buf, mt, realname, rn_len, ncp); - zap_unlockdir(zap, FTAG); - return (err); -} - -static int -zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints) -{ - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - fzap_prefetch(zn); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (0); -} - -int -zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - 
err = zap_prefetch_uint64_impl(zap, key, key_numints); - /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_prefetch_uint64_impl(zap, key, key_numints); - /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -static int -zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf, - uint64_t *actual_num_integers) -{ - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - int err = fzap_lookup(zn, integer_size, num_integers, buf, - NULL, 0, NULL, actual_num_integers); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_length_uint64_impl(zap, key, key_numints, - integer_size, num_integers, buf, NULL); - /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_length_uint64_impl(zap, key, key_numints, - integer_size, num_integers, buf, NULL); - /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t 
*key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf, - uint64_t *actual_num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_length_uint64_impl(zap, key, key_numints, - integer_size, num_integers, buf, actual_num_integers); - /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_contains(objset_t *os, uint64_t zapobj, const char *name) -{ - int err = zap_lookup_norm(os, zapobj, name, 0, - 0, NULL, 0, NULL, 0, NULL); - if (err == EOVERFLOW || err == EINVAL) - err = 0; /* found, but skipped reading the value */ - return (err); -} - -int -zap_length(objset_t *os, uint64_t zapobj, const char *name, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_str(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_length(zn, integer_size, num_integers); - } else { - zfs_btree_index_t idx; - mzap_ent_t *mze = mze_find(zn, &idx); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - if (integer_size) - *integer_size = 8; - if (num_integers) - *num_integers = 1; - } - } - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_length(zn, integer_size, num_integers); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); 
- return (err); -} - -int -zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, - int key_numints, uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_length(zn, integer_size, num_integers); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -static void +void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; @@ -1495,612 +511,6 @@ mzap_addent(zap_name_t *zn, uint64_t value) cmn_err(CE_PANIC, "out of entries!"); } -static int -zap_add_impl(zap_t *zap, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx, const void *tag) -{ - const uint64_t *intval = val; - int err = 0; - - zap_name_t *zn = zap_name_alloc_str(zap, key, 0); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_add(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - } else if (integer_size != 8 || num_integers != 1 || - strlen(key) >= MZAP_NAME_LEN || - !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { - err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); - if (err == 0) { - err = fzap_add(zn, integer_size, num_integers, val, - tag, tx); - } - zap = zn->zn_zap; /* fzap_add() may change zap */ - } else { - zfs_btree_index_t idx; - if (mze_find(zn, &idx) != NULL) { - err = SET_ERROR(EEXIST); - } else { - mzap_addent(zn, *intval); - } - } - ASSERT(zap == zn->zn_zap); - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_add(objset_t *os, uint64_t zapobj, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - 
zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); - /* zap_add_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_add_by_dnode(dnode_t *dn, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); - /* zap_add_impl() calls zap_unlockdir() */ - return (err); -} - -static int -zap_add_uint64_impl(zap_t *zap, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx, const void *tag) -{ - int err; - - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_uint64_impl(zap, key, key_numints, - integer_size, num_integers, val, tx, FTAG); - /* zap_add_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - 
return (err); - err = zap_add_uint64_impl(zap, key, key_numints, - integer_size, num_integers, val, tx, FTAG); - /* zap_add_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_update(objset_t *os, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - const uint64_t *intval = val; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_str(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_update(zn, integer_size, num_integers, val, - FTAG, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ - } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - (u_longlong_t)zapobj, integer_size, - (u_longlong_t)num_integers, name); - err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); - if (err == 0) { - err = fzap_update(zn, integer_size, num_integers, - val, FTAG, tx); - } - zap = zn->zn_zap; /* fzap_update() may change zap */ - } else { - zfs_btree_index_t idx; - mzap_ent_t *mze = mze_find(zn, &idx); - if (mze != NULL) { - MZE_PHYS(zap, mze)->mze_value = *intval; - } else { - mzap_addent(zn, *intval); - } - } - ASSERT(zap == zn->zn_zap); - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); - return (err); -} - -static int -zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, - const void *tag) -{ - int err; - - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_update(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_update() may 
change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, const void *val, - dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_update_uint64_impl(zap, key, key_numints, - integer_size, num_integers, val, tx, FTAG); - /* zap_update_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_update_uint64_impl(zap, key, key_numints, - integer_size, num_integers, val, tx, FTAG); - /* zap_update_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) -{ - return (zap_remove_norm(os, zapobj, name, 0, tx)); -} - -static int -zap_remove_impl(zap_t *zap, const char *name, - matchtype_t mt, dmu_tx_t *tx) -{ - int err = 0; - - zap_name_t *zn = zap_name_alloc_str(zap, name, mt); - if (zn == NULL) - return (SET_ERROR(ENOTSUP)); - if (!zap->zap_ismicro) { - err = fzap_remove(zn, tx); - } else { - zfs_btree_index_t idx; - mzap_ent_t *mze = mze_find(zn, &idx); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - zap->zap_m.zap_num_entries--; - memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); - zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); - } - } - zap_name_free(zn); - return (err); -} - -int -zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, - matchtype_t mt, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, tx, 
RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - err = zap_remove_impl(zap, name, mt, tx); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - err = zap_remove_impl(zap, name, 0, tx); - zap_unlockdir(zap, FTAG); - return (err); -} - -static int -zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, - dmu_tx_t *tx, const void *tag) -{ - int err; - - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); - /* zap_remove_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, - dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); - /* zap_remove_uint64_impl() calls zap_unlockdir() */ - return (err); -} - - -static zap_attribute_t * -zap_attribute_alloc_impl(boolean_t longname) -{ - zap_attribute_t *za; - - za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache, - KM_SLEEP); - za->za_name_len = (longname)? 
ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; - return (za); -} - -zap_attribute_t * -zap_attribute_alloc(void) -{ - return (zap_attribute_alloc_impl(B_FALSE)); -} - -zap_attribute_t * -zap_attribute_long_alloc(void) -{ - return (zap_attribute_alloc_impl(B_TRUE)); -} - -void -zap_attribute_free(zap_attribute_t *za) -{ - if (za->za_name_len == ZAP_MAXNAMELEN) { - kmem_cache_free(zap_attr_cache, za); - } else { - ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW); - kmem_cache_free(zap_attr_long_cache, za); - } -} - -/* - * Routines for iterating over the attributes. - */ - -static void -zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized, boolean_t prefetch) -{ - zc->zc_objset = os; - zc->zc_zap = NULL; - zc->zc_leaf = NULL; - zc->zc_zapobj = zapobj; - zc->zc_serialized = serialized; - zc->zc_hash = 0; - zc->zc_cd = 0; - zc->zc_prefetch = prefetch; -} -void -zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized) -{ - zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); -} - -/* - * Initialize a cursor at the beginning of the ZAP object. The entire - * ZAP object will be prefetched. - */ -void -zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); -} - -/* - * Initialize a cursor at the beginning, but request that we not prefetch - * the entire ZAP object. 
- */ -void -zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); -} - -void -zap_cursor_fini(zap_cursor_t *zc) -{ - if (zc->zc_zap) { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - zap_unlockdir(zc->zc_zap, NULL); - zc->zc_zap = NULL; - } - if (zc->zc_leaf) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - zc->zc_objset = NULL; -} - -uint64_t -zap_cursor_serialize(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return (-1ULL); - if (zc->zc_zap == NULL) - return (zc->zc_serialized); - ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap))); - ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); - - /* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So usually use a small - * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits - * of the cursor. - * - * [ collision differentiator | zap_hashbits()-bit hash value ] - */ - return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | - ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); -} - -int -zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) -{ - int err; - - if (zc->zc_hash == -1ULL) - return (SET_ERROR(ENOENT)); - - if (zc->zc_zap == NULL) { - int hb; - err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); - if (err != 0) - return (err); - - /* - * To support zap_cursor_init_serialized, advance, retrieve, - * we must add to the existing zc_cd, which may already - * be 1 due to the zap_cursor_advance. 
- */ - ASSERT0(zc->zc_hash); - hb = zap_hashbits(zc->zc_zap); - zc->zc_hash = zc->zc_serialized << (64 - hb); - zc->zc_cd += zc->zc_serialized >> hb; - if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ - zc->zc_cd = 0; - } else { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - } - if (!zc->zc_zap->zap_ismicro) { - err = fzap_cursor_retrieve(zc->zc_zap, zc, za); - } else { - zfs_btree_index_t idx; - mzap_ent_t mze_tofind; - - mze_tofind.mze_hash = zc->zc_hash >> 32; - mze_tofind.mze_cd = zc->zc_cd; - - mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, - &mze_tofind, &idx); - if (mze == NULL) { - mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, - &idx, &idx); - } - if (mze) { - mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); - ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); - za->za_normalization_conflict = - mzap_normalization_conflict(zc->zc_zap, NULL, - mze, &idx); - za->za_integer_length = 8; - za->za_num_integers = 1; - za->za_first_integer = mzep->mze_value; - (void) strlcpy(za->za_name, mzep->mze_name, - za->za_name_len); - zc->zc_hash = (uint64_t)mze->mze_hash << 32; - zc->zc_cd = mze->mze_cd; - err = 0; - } else { - zc->zc_hash = -1ULL; - err = SET_ERROR(ENOENT); - } - } - rw_exit(&zc->zc_zap->zap_rwlock); - return (err); -} - -void -zap_cursor_advance(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return; - zc->zc_cd++; -} - -int -zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - - memset(zs, 0, sizeof (zap_stats_t)); - - if (zap->zap_ismicro) { - zs->zs_blocksize = zap->zap_dbuf->db_size; - zs->zs_num_entries = zap->zap_m.zap_num_entries; - zs->zs_num_blocks = 1; - } else { - fzap_get_stats(zap, zs); - } - zap_unlockdir(zap, FTAG); - return (0); -} - -#if defined(_KERNEL) -EXPORT_SYMBOL(zap_create); -EXPORT_SYMBOL(zap_create_dnsize); -EXPORT_SYMBOL(zap_create_norm); 
-EXPORT_SYMBOL(zap_create_norm_dnsize); -EXPORT_SYMBOL(zap_create_flags); -EXPORT_SYMBOL(zap_create_flags_dnsize); -EXPORT_SYMBOL(zap_create_claim); -EXPORT_SYMBOL(zap_create_claim_norm); -EXPORT_SYMBOL(zap_create_claim_norm_dnsize); -EXPORT_SYMBOL(zap_create_hold); -EXPORT_SYMBOL(zap_destroy); -EXPORT_SYMBOL(zap_lookup); -EXPORT_SYMBOL(zap_lookup_by_dnode); -EXPORT_SYMBOL(zap_lookup_norm); -EXPORT_SYMBOL(zap_lookup_uint64); -EXPORT_SYMBOL(zap_lookup_length_uint64_by_dnode); -EXPORT_SYMBOL(zap_contains); -EXPORT_SYMBOL(zap_prefetch); -EXPORT_SYMBOL(zap_prefetch_uint64); -EXPORT_SYMBOL(zap_prefetch_object); -EXPORT_SYMBOL(zap_add); -EXPORT_SYMBOL(zap_add_by_dnode); -EXPORT_SYMBOL(zap_add_uint64); -EXPORT_SYMBOL(zap_add_uint64_by_dnode); -EXPORT_SYMBOL(zap_update); -EXPORT_SYMBOL(zap_update_uint64); -EXPORT_SYMBOL(zap_update_uint64_by_dnode); -EXPORT_SYMBOL(zap_length); -EXPORT_SYMBOL(zap_length_uint64); -EXPORT_SYMBOL(zap_length_uint64_by_dnode); -EXPORT_SYMBOL(zap_remove); -EXPORT_SYMBOL(zap_remove_by_dnode); -EXPORT_SYMBOL(zap_remove_norm); -EXPORT_SYMBOL(zap_remove_uint64); -EXPORT_SYMBOL(zap_remove_uint64_by_dnode); -EXPORT_SYMBOL(zap_count); -EXPORT_SYMBOL(zap_count_by_dnode); -EXPORT_SYMBOL(zap_value_search); -EXPORT_SYMBOL(zap_join); -EXPORT_SYMBOL(zap_join_increment); -EXPORT_SYMBOL(zap_add_int); -EXPORT_SYMBOL(zap_remove_int); -EXPORT_SYMBOL(zap_lookup_int); -EXPORT_SYMBOL(zap_increment_int); -EXPORT_SYMBOL(zap_add_int_key); -EXPORT_SYMBOL(zap_lookup_int_key); -EXPORT_SYMBOL(zap_increment); -EXPORT_SYMBOL(zap_cursor_init); -EXPORT_SYMBOL(zap_cursor_fini); -EXPORT_SYMBOL(zap_cursor_retrieve); -EXPORT_SYMBOL(zap_cursor_advance); -EXPORT_SYMBOL(zap_cursor_serialize); -EXPORT_SYMBOL(zap_cursor_init_serialized); -EXPORT_SYMBOL(zap_get_stats); - ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, "Maximum micro ZAP size before converting to a fat ZAP, " "in bytes (max 1M)"); -#endif