From 9f346abbe8645c58d8adb778c419bb5422c83cf4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Aug 2022 13:33:32 -0700 Subject: [PATCH 01/69] Revert "Avoid panic with recordsize > 128k, raw sending and no large_blocks" This reverts commit 80a650b7bb04bce3aef5e4cfd1d966e3599dafd4. This change inadvertently introduced a regression in ztest where one of the new ASSERTs is triggered in dsl_scan_visitbp(). Reviewed-by: George Amanakis Signed-off-by: Brian Behlendorf Issue #12275 Closes #13799 --- include/sys/dsl_dataset.h | 1 - lib/libzfs/libzfs_sendrecv.c | 10 -------- module/zfs/dmu_objset.c | 10 -------- module/zfs/dmu_send.c | 4 ---- module/zfs/dsl_dataset.c | 46 ++++++++++++++++-------------------- module/zfs/dsl_scan.c | 15 ------------ 6 files changed, 20 insertions(+), 66 deletions(-) diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 36307c63151..81d25da831b 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -375,7 +375,6 @@ boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx); -void dsl_dataset_feature_set_activation(const blkptr_t *bp, dsl_dataset_t *ds); void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 640051e3b02..577ebf6aad4 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -874,11 +874,6 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, case EINVAL: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "large blocks detected but large_blocks feature " - "is inactive; raw send unsupported")); - return (zfs_error(hdl, EZFS_NOTSUP, 
errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); @@ -2702,11 +2697,6 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, case EROFS: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "large blocks detected but large_blocks feature " - "is inactive; raw send unsupported")); - return (zfs_error(hdl, EZFS_NOTSUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index b9e16f79efc..4c20afcdb9c 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1695,16 +1695,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - /* - * In the codepath dsl_dataset_sync()->dmu_objset_sync() we cannot - * rely on the zio above completing and calling back - * dmu_objset_write_done()->dsl_dataset_block_born() before - * dsl_dataset_sync() actually activates feature flags near its end. - * Decide here if any features need to be activated, before - * dsl_dataset_sync() completes its run. - */ - dsl_dataset_feature_set_activation(blkptr_copy, os->os_dsl_dataset); - /* * Sync special dnodes - the parent IO for the sync is the root block */ diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 5e6ced2bb30..283e2d3b37b 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -493,7 +493,6 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, (bp != NULL ? 
BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && io_compressed : lsize != psize); if (raw || compressed) { - ASSERT(bp != NULL); ASSERT(raw || dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); @@ -1018,9 +1017,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE && !(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) { - if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) - return (SET_ERROR(ENOTSUP)); - while (srdp->datablksz > 0 && err == 0) { int n = MIN(srdp->datablksz, SPA_OLD_MAXBLOCKSIZE); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 21fef4c621b..8f3240a5deb 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -132,30 +132,6 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) return (new_bytes - old_bytes); } -void -dsl_dataset_feature_set_activation(const blkptr_t *bp, dsl_dataset_t *ds) -{ - spa_feature_t f; - if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { - ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] = - (void *)B_TRUE; - } - - f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); - if (f != SPA_FEATURE_NONE) { - ASSERT3S(spa_feature_table[f].fi_type, ==, - ZFEATURE_TYPE_BOOLEAN); - ds->ds_feature_activation[f] = (void *)B_TRUE; - } - - f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); - if (f != SPA_FEATURE_NONE) { - ASSERT3S(spa_feature_table[f].fi_type, ==, - ZFEATURE_TYPE_BOOLEAN); - ds->ds_feature_activation[f] = (void *)B_TRUE; - } -} - void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { @@ -164,6 +140,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; + spa_feature_t f; dprintf_bp(bp, "ds=%p", ds); @@ -188,7 +165,25 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; 
dsl_dataset_phys(ds)->ds_unique_bytes += used; - dsl_dataset_feature_set_activation(bp, ds); + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { + ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] = + (void *)B_TRUE; + } + + + f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) { + ASSERT3S(spa_feature_table[f].fi_type, ==, + ZFEATURE_TYPE_BOOLEAN); + ds->ds_feature_activation[f] = (void *)B_TRUE; + } + + f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); + if (f != SPA_FEATURE_NONE) { + ASSERT3S(spa_feature_table[f].fi_type, ==, + ZFEATURE_TYPE_BOOLEAN); + ds->ds_feature_activation[f] = (void *)B_TRUE; + } /* * Track block for livelist, but ignore embedded blocks because @@ -5027,4 +5022,3 @@ EXPORT_SYMBOL(dsl_dsobj_to_dsname); EXPORT_SYMBOL(dsl_dataset_check_quota); EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl); EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl); -EXPORT_SYMBOL(dsl_dataset_feature_set_activation); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 2b76bed1b69..28afc3dead7 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2008,21 +2008,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, return; } - /* - * Check if this block contradicts any filesystem flags. 
- */ - spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS; - if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) - ASSERT3B(dsl_dataset_feature_is_active(ds, f), ==, B_TRUE); - - f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); - if (f != SPA_FEATURE_NONE) - ASSERT3B(dsl_dataset_feature_is_active(ds, f), ==, B_TRUE); - - f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); - if (f != SPA_FEATURE_NONE) - ASSERT3B(dsl_dataset_feature_is_active(ds, f), ==, B_TRUE); - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; From 5bc0318047d9fbd6b740299df1cd3188285d9004 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 Aug 2022 23:22:10 +0200 Subject: [PATCH 02/69] ZTS: zvol_stress: fix race condition with zinject usage In automated ZTS runs, I'd occasionally hit log_fail "Expected to see some write errors" because there weren't any write errors. The reason is that we're not syncing the zpool before `zinject -c`. If the writes by `dd` aren't synced out at the time `zinject -c` runs, they will not hit an error and we'll hit the log_fail above. 
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Christian Schwarz Closes #13793 --- .../zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh index 883d9984be4..3431d33d97d 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh @@ -151,6 +151,7 @@ for DISK in $DISKS ; do log_must zinject -d $DISK -f 10 -e io -T write $TESTPOOL done log_must dd if=/dev/zero of=$ZVOL_DEVDIR/$TESTPOOL/testvol1 bs=512 count=50 +sync_pool $TESTPOOL log_must zinject -c all # We should see write errors From 2d5622f5be15e9e977a4c8fe5d24baaf487b0432 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 26 Aug 2022 16:04:27 -0500 Subject: [PATCH 03/69] Importing from cachefile can trip assertion When importing from cachefile, it is possible that the builtin retry logic will trip an assertion because it also fails to find the pool. This fix addresses that case and returns the correct error message to the user. Reviewed-by: Richard Yao Reviewed-by: Serapheim Dimitropoulos Reviewed-by: Brian Behlendorf Signed-off-by: George Wilson Closes #13781 --- lib/libzutil/zutil_import.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 0bbf232f24a..252b0bac685 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1694,6 +1694,8 @@ zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg) * caller. 
*/ nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + if (pair == NULL) + continue; fnvlist_add_nvlist(pools, nvpair_name(pair), fnvpair_value_nvlist(pair)); From 58e8054bce3f493cc1f38f7177cfdb942fa4deb9 Mon Sep 17 00:00:00 2001 From: Andrew Innes Date: Sat, 3 Sep 2022 04:15:18 +0800 Subject: [PATCH 04/69] Alloc zdb_cb_t to fix stack issue Alloc zdb_cb_t since it is too large for the stack on Windows which results in `zdb` crashing immediately. Reviewed-by: Brian Behlendorf Signed-off-by: Andrew Innes Co-authored-by: Jorgen Lundman Closes #13807 --- cmd/zdb/zdb.c | 81 ++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index fdf569691cb..5389520e803 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -6415,7 +6415,7 @@ deleted_livelists_dump_mos(spa_t *spa) static int dump_block_stats(spa_t *spa) { - zdb_cb_t zcb = {{{{0}}}}; + zdb_cb_t *zcb; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | @@ -6424,6 +6424,8 @@ dump_block_stats(spa_t *spa) int e, c, err; bp_embedded_type_t i; + zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", @@ -6443,39 +6445,39 @@ dump_block_stats(spa_t *spa) * pool claiming each block we discover, but we skip opening any space * maps. */ - zdb_leak_init(spa, &zcb); + zdb_leak_init(spa, zcb); /* * If there's a deferred-free bplist, process that first. 
*/ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, - bpobj_count_block_cb, &zcb, NULL); + bpobj_count_block_cb, zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, - bpobj_count_block_cb, &zcb, NULL); + bpobj_count_block_cb, zcb, NULL); } - zdb_claim_removing(spa, &zcb); + zdb_claim_removing(spa, zcb); if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, - &zcb, NULL)); + zcb, NULL)); } - deleted_livelists_count_blocks(spa, &zcb); + deleted_livelists_count_blocks(spa, zcb); if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; - zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); - zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); - zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); - zcb.zcb_totalasize += + zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); + zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); + zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); + zcb->zcb_totalasize += metaslab_class_get_alloc(spa_embedded_log_class(spa)); - zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); - err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); + err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); /* * If we've traversed the data blocks then we need to wait for those @@ -6496,15 +6498,15 @@ dump_block_stats(spa_t *spa) * Done after zio_wait() since zcb_haderrors is modified in * zdb_blkptr_done() */ - zcb.zcb_haderrors |= err; + zcb->zcb_haderrors |= err; - if (zcb.zcb_haderrors) { + if (zcb->zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (e = 0; e < 256; e++) { - if (zcb.zcb_errors[e] != 0) { + if (zcb->zcb_errors[e] != 0) { (void) 
printf("\t%5d %llu\n", - e, (u_longlong_t)zcb.zcb_errors[e]); + e, (u_longlong_t)zcb->zcb_errors[e]); } } } @@ -6512,9 +6514,9 @@ dump_block_stats(spa_t *spa) /* * Report any leaked segments. */ - leaks |= zdb_leak_fini(spa, &zcb); + leaks |= zdb_leak_fini(spa, zcb); - tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; + tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); @@ -6525,8 +6527,8 @@ dump_block_stats(spa_t *spa) metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); - total_found = tzb->zb_asize - zcb.zcb_dedup_asize + - zcb.zcb_removing_size + zcb.zcb_checkpoint_size; + total_found = tzb->zb_asize - zcb->zcb_dedup_asize + + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { (void) printf("\n\tNo leaks (block sum matches space" @@ -6541,8 +6543,10 @@ dump_block_stats(spa_t *spa) leaks = B_TRUE; } - if (tzb->zb_count == 0) + if (tzb->zb_count == 0) { + umem_free(zcb, sizeof (zdb_cb_t)); return (2); + } (void) printf("\n"); (void) printf("\t%-16s %14llu\n", "bp count:", @@ -6561,9 +6565,9 @@ dump_block_stats(spa_t *spa) (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", - "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, - (u_longlong_t)zcb.zcb_dedup_blocks, - (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); + "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, + (u_longlong_t)zcb->zcb_dedup_blocks, + (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); @@ -6601,19 +6605,19 @@ dump_block_stats(spa_t *spa) } for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { - if (zcb.zcb_embedded_blocks[i] == 0) + 
if (zcb->zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", - i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); + i, (u_longlong_t)zcb->zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); - dump_histogram(zcb.zcb_embedded_histogram[i], - sizeof (zcb.zcb_embedded_histogram[i]) / - sizeof (zcb.zcb_embedded_histogram[i][0]), 0); + dump_histogram(zcb->zcb_embedded_histogram[i], + sizeof (zcb->zcb_embedded_histogram[i]) / + sizeof (zcb->zcb_embedded_histogram[i][0]), 0); } } @@ -6673,7 +6677,7 @@ dump_block_stats(spa_t *spa) else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; - if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { + if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", @@ -6689,7 +6693,7 @@ dump_block_stats(spa_t *spa) for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? ZB_TOTAL : l); - zb = &zcb.zcb_type[level][t]; + zb = &zcb->zcb_type[level][t]; if (zb->zb_asize == 0) continue; @@ -6698,7 +6702,7 @@ dump_block_stats(spa_t *spa) continue; if (level == 0 && zb->zb_asize == - zcb.zcb_type[ZB_TOTAL][t].zb_asize) + zcb->zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, @@ -6742,18 +6746,23 @@ dump_block_stats(spa_t *spa) /* Output a table summarizing block sizes in the pool */ if (dump_opt['b'] >= 2) { - dump_size_histograms(&zcb); + dump_size_histograms(zcb); } } (void) printf("\n"); - if (leaks) + if (leaks) { + umem_free(zcb, sizeof (zdb_cb_t)); return (2); + } - if (zcb.zcb_haderrors) + if (zcb->zcb_haderrors) { + umem_free(zcb, sizeof (zdb_cb_t)); return (3); + } + umem_free(zcb, sizeof (zdb_cb_t)); return (0); } From 0b30dc484f7e70bc8bfe53fefc8581d181044efa Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 2 Sep 2022 16:20:10 -0400 Subject: [PATCH 05/69] FreeBSD: Cleanup dead code from VFS The vfs_*_feature() macros turn 
anything that uses them into dead code, so we can delete all of it. As a side effect, zfs_set_fuid_feature() is now identical in module/os/freebsd/zfs/zfs_vnops_os.c and module/os/linux/zfs/zfs_vnops_os.c. A few other functions are identical too. Future cleanup could move these into a common file. Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13832 --- include/os/freebsd/spl/sys/vfs.h | 4 ---- module/os/freebsd/zfs/zfs_vfsops.c | 26 -------------------------- module/os/freebsd/zfs/zfs_vnops_os.c | 23 ----------------------- 3 files changed, 53 deletions(-) diff --git a/include/os/freebsd/spl/sys/vfs.h b/include/os/freebsd/spl/sys/vfs.h index 22d57cc473e..7f163fcfdb1 100644 --- a/include/os/freebsd/spl/sys/vfs.h +++ b/include/os/freebsd/spl/sys/vfs.h @@ -117,9 +117,5 @@ typedef uint64_t vfs_feature_t; #define VFSFT_ZEROCOPY_SUPPORTED 0x100000200 /* Support loaning /returning cache buffer */ -#define vfs_set_feature(vfsp, feature) do { } while (0) -#define vfs_clear_feature(vfsp, feature) do { } while (0) -#define vfs_has_feature(vfsp, feature) (0) - #include #endif /* _OPENSOLARIS_SYS_VFS_H_ */ diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 24e06b1a880..4e4a5f8d215 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -1151,23 +1151,6 @@ static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_vfs) { - if (zfsvfs->z_use_fuids) { - vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); - } else { - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - 
vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); - } - } zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } @@ -1226,15 +1209,6 @@ zfs_domount(vfs_t *vfsp, char *osname) * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); - vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); - vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); - } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { - vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); - vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); - } - vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index b46cc550c78..f0579626c5a 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1672,7 +1672,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, int outcount; int error; uint8_t prefetch; - boolean_t check_sysattrs; uint8_t type; int ncooks; cookie_t *cooks = NULL; @@ -1756,19 +1755,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, *cookies = cooks; *ncookies = ncooks; } - /* - * If this VFS supports the system attribute view interface; and - * we're looking at an extended attribute directory; and we care - * about normalization conflicts on this vfs; then we must check - * for normalization conflicts with the sysattr name space. 
- */ -#ifdef TODO - check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && - (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && - (flags & V_RDDIR_ENTFLAGS); -#else - check_sysattrs = 0; -#endif /* * Transform to file-system independent format @@ -1824,15 +1810,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); - - if (check_sysattrs && !zap.za_normalization_conflict) { -#ifdef TODO - zap.za_normalization_conflict = - xattr_sysattr_casechk(zap.za_name); -#else - panic("%s:%u: TODO", __func__, __LINE__); -#endif - } } if (flags & V_RDDIR_ACCFILTER) { From f933b3fd4dda8b37aa37aeae05951b76f51ddae7 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 2 Sep 2022 16:21:18 -0400 Subject: [PATCH 06/69] Apply arc_shrink_shift to ARC above arc_c_min It makes sense to free memory in smaller chunks when approaching arc_c_min to let other kernel subsystems free more, since after that point we can't free anything. This also matches behavior on Linux, where only the size above arc_c_min is reported to the shrinker. 
Reviewed-by: Ryan Moeller Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #13794 --- module/os/freebsd/zfs/arc_os.c | 5 ++++- module/zfs/arc.c | 9 +++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index ca2bf884257..dbd71ea43fd 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -221,7 +221,10 @@ arc_lowmem(void *arg __unused, int howto __unused) arc_warm = B_TRUE; arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); free_memory = arc_available_memory(); - to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); + int64_t can_free = arc_c - arc_c_min; + if (can_free <= 0) + return; + to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0); DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); arc_reduce_target_size(to_free); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 579e78befe1..980dc60d0cc 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5051,10 +5051,11 @@ arc_reap_cb(void *arg, zthr_t *zthr) */ free_memory = arc_available_memory(); - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { - arc_reduce_target_size(to_free); + int64_t can_free = arc_c - arc_c_min; + if (can_free > 0) { + int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; + if (to_free > 0) + arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } From 899355d293830f250e46d6b651db5afed08b91ea Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Sat, 3 Sep 2022 01:24:07 +0500 Subject: [PATCH 07/69] Add zilstat script to report zil kstats in a user friendly manner Added a python script to process both global and per dataset zil kstats and report them in a user friendly manner similar to arcstat and dbufstat. 
Reviewed-by: George Melikov Reviewed-by: Ryan Moeller Reviewed-by: Alexander Motin Reviewed-by: Richard Elling Signed-off-by: Ameer Hamza Closes #13704 --- cmd/Makefile.am | 7 +- cmd/zilstat.in | 467 ++++++++++++++++++ rpm/generic/zfs.spec.in | 4 +- tests/runfiles/common.run | 3 +- tests/runfiles/sanity.run | 3 +- tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../cli_user/misc/zilstat_001_pos.ksh | 37 ++ 8 files changed, 517 insertions(+), 6 deletions(-) create mode 100755 cmd/zilstat.in create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 65de980da30..6d6de4adb42 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -100,12 +100,13 @@ endif if USING_PYTHON -bin_SCRIPTS += arc_summary arcstat dbufstat -CLEANFILES += arc_summary arcstat dbufstat -dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in +bin_SCRIPTS += arc_summary arcstat dbufstat zilstat +CLEANFILES += arc_summary arcstat dbufstat zilstat +dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in $(call SUBST,arcstat,%D%/) $(call SUBST,dbufstat,%D%/) +$(call SUBST,zilstat,%D%/) arc_summary: %D%/arc_summary $(AM_V_at)cp $< $@ endif diff --git a/cmd/zilstat.in b/cmd/zilstat.in new file mode 100755 index 00000000000..cf4e2e0dd0c --- /dev/null +++ b/cmd/zilstat.in @@ -0,0 +1,467 @@ +#!/usr/bin/env @PYTHON_SHEBANG@ +# +# Print out statistics for all zil stats. This information is +# available through the zil kstat. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# This script must remain compatible with Python 3.6+. +# + +import sys +import subprocess +import time +import copy +import os +import re +import signal +from collections import defaultdict +import argparse +from argparse import RawTextHelpFormatter + +cols = { + # hdr: [size, scale, kstat name] + "time": [8, -1, "time"], + "pool": [12, -1, "pool"], + "ds": [12, -1, "dataset_name"], + "obj": [12, -1, "objset"], + "zcc": [10, 1000, "zil_commit_count"], + "zcwc": [10, 1000, "zil_commit_writer_count"], + "ziic": [10, 1000, "zil_itx_indirect_count"], + "zic": [10, 1000, "zil_itx_count"], + "ziib": [10, 1024, "zil_itx_indirect_bytes"], + "zicc": [10, 1000, "zil_itx_copied_count"], + "zicb": [10, 1024, "zil_itx_copied_bytes"], + "zinc": [10, 1000, "zil_itx_needcopy_count"], + "zinb": [10, 1024, "zil_itx_needcopy_bytes"], + "zimnc": [10, 1000, "zil_itx_metaslab_normal_count"], + "zimnb": [10, 1024, "zil_itx_metaslab_normal_bytes"], + "zimsc": [10, 1000, "zil_itx_metaslab_slog_count"], + "zimsb": [10, 1024, "zil_itx_metaslab_slog_bytes"], +} + +hdr = ["time", "pool", "ds", "obj", "zcc", "zcwc", "ziic", "zic", "ziib", \ + "zicc", "zicb", "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] + +ghdr = ["time", "zcc", "zcwc", "ziic", "zic", "ziib", "zicc", "zicb", + "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] + +cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]") + +curr = {} +diff = {} +kstat = {} +ds_pairs = {} +pool_name = None +dataset_name = None +interval = 0 +sep = " " +gFlag = True +dsFlag = False + 
+def prettynum(sz, scale, num=0): + suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] + index = 0 + save = 0 + + if scale == -1: + return "%*s" % (sz, num) + + # Rounding error, return 0 + elif 0 < num < 1: + num = 0 + + while num > scale and index < 5: + save = num + num = num / scale + index += 1 + + if index == 0: + return "%*d" % (sz, num) + + if (save / scale) < 10: + return "%*.1f%s" % (sz - 1, num, suffix[index]) + else: + return "%*d%s" % (sz - 1, num, suffix[index]) + +def print_header(): + global hdr + global sep + for col in hdr: + new_col = col + if interval > 0 and col not in ['time', 'pool', 'ds', 'obj']: + new_col += "/s" + sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep)) + sys.stdout.write("\n") + +def print_values(v): + global hdr + global sep + for col in hdr: + val = v[cols[col][2]] + if col not in ['time', 'pool', 'ds', 'obj'] and interval > 0: + val = v[cols[col][2]] // interval + sys.stdout.write("%s%s" % ( + prettynum(cols[col][0], cols[col][1], val), sep)) + sys.stdout.write("\n") + +def print_dict(d): + for pool in d: + for objset in d[pool]: + print_values(d[pool][objset]) + +def detailed_usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("Field definitions are as follows:\n") + for key in cols: + sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) + sys.stderr.write("\n") + sys.exit(0) + +def init(): + global pool_name + global dataset_name + global interval + global hdr + global curr + global gFlag + global sep + + curr = dict() + + parser = argparse.ArgumentParser(description='Program to print zilstats', + add_help=True, + formatter_class=RawTextHelpFormatter, + epilog="\nUsage Examples\n"\ + "Note: Global zilstats is shown by default,"\ + " if none of a|p|d option is not provided\n"\ + "\tzilstat -a\n"\ + '\tzilstat -v\n'\ + '\tzilstat -p tank\n'\ + '\tzilstat -d tank/d1,tank/d2,tank/zv1\n'\ + '\tzilstat -i 1\n'\ + '\tzilstat -s \"***\"\n'\ + '\tzilstat -f zcwc,zimnb,zimsb\n') + + parser.add_argument( + "-v", 
"--verbose", + action="store_true", + help="List field headers and definitions" + ) + + pool_grp = parser.add_mutually_exclusive_group() + + pool_grp.add_argument( + "-a", "--all", + action="store_true", + dest="all", + help="Print all dataset stats" + ) + + pool_grp.add_argument( + "-p", "--pool", + type=str, + help="Print stats for all datasets of a speicfied pool" + ) + + pool_grp.add_argument( + "-d", "--dataset", + type=str, + help="Print given dataset(s) (Comma separated)" + ) + + parser.add_argument( + "-f", "--columns", + type=str, + help="Specify specific fields to print (see -v)" + ) + + parser.add_argument( + "-s", "--separator", + type=str, + help="Override default field separator with custom " + "character or string" + ) + + parser.add_argument( + "-i", "--interval", + type=int, + dest="interval", + help="Print stats between specified interval" + " (in seconds)" + ) + + parsed_args = parser.parse_args() + + if parsed_args.verbose: + detailed_usage() + + if parsed_args.all: + gFlag = False + + if parsed_args.interval: + interval = parsed_args.interval + + if parsed_args.pool: + pool_name = parsed_args.pool + gFlag = False + + if parsed_args.dataset: + dataset_name = parsed_args.dataset + gFlag = False + + if parsed_args.separator: + sep = parsed_args.separator + + if gFlag: + hdr = ghdr + + if parsed_args.columns: + hdr = parsed_args.columns.split(",") + + invalid = [] + for ele in hdr: + if gFlag and ele not in ghdr: + invalid.append(ele) + elif ele not in cols: + invalid.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! 
-- %s\n" % invalid) + sys.exit(1) + + if pool_name and dataset_name: + print ("Error: Can not filter both dataset and pool") + sys.exit(1) + +def FileCheck(fname): + try: + return (open(fname)) + except IOError: + print ("Unable to open zilstat proc file: " + fname) + sys.exit(1) + +if sys.platform.startswith('freebsd'): + # Requires py-sysctl on FreeBSD + import sysctl + + def kstat_update(pool = None, objid = None): + global kstat + kstat = {} + if not pool: + file = "kstat.zfs.misc.zil" + k = [ctl for ctl in sysctl.filter(file) \ + if ctl.type != sysctl.CTLTYPE_NODE] + kstat_process_str(k, file, "GLOBAL", len(file + ".")) + elif objid: + file = "kstat.zfs." + pool + ".dataset.objset-" + objid + k = [ctl for ctl in sysctl.filter(file) if ctl.type \ + != sysctl.CTLTYPE_NODE] + kstat_process_str(k, file, objid, len(file + ".")) + else: + file = "kstat.zfs." + pool + ".dataset" + zil_start = len(file + ".") + obj_start = len("kstat.zfs." + pool + ".") + k = [ctl for ctl in sysctl.filter(file) + if ctl.type != sysctl.CTLTYPE_NODE] + for s in k: + if not s or (s.name.find("zil") == -1 and \ + s.name.find("dataset_name") == -1): + continue + name, value = s.name, s.value + objid = re.findall(r'0x[0-9A-F]+', \ + name[obj_start:], re.I)[0] + if objid not in kstat: + kstat[objid] = dict() + zil_start = len(file + ".objset-" + \ + objid + ".") + kstat[objid][name[zil_start:]] = value \ + if (name.find("dataset_name")) \ + else int(value) + + def kstat_process_str(k, file, objset = "GLOBAL", zil_start = 0): + global kstat + if not k: + print("Unable to process kstat for: " + file) + sys.exit(1) + kstat[objset] = dict() + for s in k: + if not s or (s.name.find("zil") == -1 and \ + s.name.find("dataset_name") == -1): + continue + name, value = s.name, s.value + kstat[objset][name[zil_start:]] = value \ + if (name.find("dataset_name")) else int(value) + +elif sys.platform.startswith('linux'): + def kstat_update(pool = None, objid = None): + global kstat + kstat = {} + if not 
pool: + k = [line.strip() for line in \ + FileCheck("/proc/spl/kstat/zfs/zil")] + kstat_process_str(k, "/proc/spl/kstat/zfs/zil") + elif objid: + file = "/proc/spl/kstat/zfs/" + pool + "/objset-" + objid + k = [line.strip() for line in FileCheck(file)] + kstat_process_str(k, file, objid) + else: + if not os.path.exists(f"/proc/spl/kstat/zfs/{pool}"): + print("Pool \"" + pool + "\" does not exist, Exitting") + sys.exit(1) + objsets = os.listdir(f'/proc/spl/kstat/zfs/{pool}') + for objid in objsets: + if objid.find("objset-") == -1: + continue + file = "/proc/spl/kstat/zfs/" + pool + "/" + objid + k = [line.strip() for line in FileCheck(file)] + kstat_process_str(k, file, objid.replace("objset-", "")) + + def kstat_process_str(k, file, objset = "GLOBAL", zil_start = 0): + global kstat + if not k: + print("Unable to process kstat for: " + file) + sys.exit(1) + + kstat[objset] = dict() + for s in k: + if not s or (s.find("zil") == -1 and \ + s.find("dataset_name") == -1): + continue + name, unused, value = s.split() + kstat[objset][name] = value \ + if (name == "dataset_name") else int(value) + +def zil_process_kstat(): + global curr, pool_name, dataset_name, dsFlag, ds_pairs + curr.clear() + if gFlag == True: + kstat_update() + zil_build_dict() + else: + if pool_name: + kstat_update(pool_name) + zil_build_dict(pool_name) + elif dataset_name: + if dsFlag == False: + dsFlag = True + datasets = dataset_name.split(',') + ds_pairs = defaultdict(list) + for ds in datasets: + try: + objid = subprocess.check_output(['zfs', + 'list', '-Hpo', 'objsetid', ds], \ + stderr=subprocess.DEVNULL) \ + .decode('utf-8').strip() + except subprocess.CalledProcessError as e: + print("Command: \"zfs list -Hpo objset "\ + + str(ds) + "\" failed with error code:"\ + + str(e.returncode)) + print("Please make sure that dataset \""\ + + str(ds) + "\" exists") + sys.exit(1) + if not objid: + continue + ds_pairs[ds.split('/')[0]]. 
\ + append(hex(int(objid))) + for pool, objids in ds_pairs.items(): + for objid in objids: + kstat_update(pool, objid) + zil_build_dict(pool) + else: + try: + pools = subprocess.check_output(['zpool', 'list', '-Hpo',\ + 'name']).decode('utf-8').split() + except subprocess.CalledProcessError as e: + print("Command: \"zpool list -Hpo name\" failed with error"\ + "code: " + str(e.returncode)) + sys.exit(1) + for pool in pools: + kstat_update(pool) + zil_build_dict(pool) + +def calculate_diff(): + global curr, diff + prev = copy.deepcopy(curr) + zil_process_kstat() + diff = copy.deepcopy(curr) + for pool in curr: + for objset in curr[pool]: + for col in hdr: + if col not in ['time', 'pool', 'ds', 'obj']: + key = cols[col][2] + # If prev is NULL, this is the + # first time we are here + if not prev: + diff[pool][objset][key] = 0 + else: + diff[pool][objset][key] \ + = curr[pool][objset][key] \ + - prev[pool][objset][key] + +def zil_build_dict(pool = "GLOBAL"): + global kstat + for objset in kstat: + for key in kstat[objset]: + val = kstat[objset][key] + if pool not in curr: + curr[pool] = dict() + if objset not in curr[pool]: + curr[pool][objset] = dict() + curr[pool][objset][key] = val + curr[pool][objset]["pool"] = pool + curr[pool][objset]["objset"] = objset + curr[pool][objset]["time"] = time.strftime("%H:%M:%S", \ + time.localtime()) + +def sign_handler_epipe(sig, frame): + print("Caught EPIPE signal: " + str(frame)) + print("Exitting...") + sys.exit(0) + +def main(): + global interval + global curr + hprint = False + init() + signal.signal(signal.SIGINT, signal.SIG_DFL) + signal.signal(signal.SIGPIPE, sign_handler_epipe) + + if interval > 0: + while True: + calculate_diff() + if not diff: + print ("Error: No stats to show") + sys.exit(0) + if hprint == False: + print_header() + hprint = True + print_dict(diff) + time.sleep(interval) + else: + zil_process_kstat() + if not curr: + print ("Error: No stats to show") + sys.exit(0) + print_header() + print_dict(curr) + 
+if __name__ == '__main__': + main() + diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index b1a94fbb7ab..aea82d24178 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -409,7 +409,8 @@ make install DESTDIR=%{?buildroot} find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; %if 0%{!?__brp_mangle_shebangs:1} find %{?buildroot}%{_bindir} \ - \( -name arc_summary -or -name arcstat -or -name dbufstat \) \ + \( -name arc_summary -or -name arcstat -or -name dbufstat \ + -or -name zilstat \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; find %{?buildroot}%{_datadir} \ \( -name test-runner.py -or -name zts-report.py \) \ @@ -487,6 +488,7 @@ systemctl --system daemon-reload >/dev/null || true %{_bindir}/arc_summary %{_bindir}/arcstat %{_bindir}/dbufstat +%{_bindir}/zilstat # Man pages %{_mandir}/man1/* %{_mandir}/man4/* diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index b9a9e0efcc8..e8443ffabcf 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -551,7 +551,8 @@ tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', - 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] + 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', + 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 7c466719643..f115f0b578c 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -396,7 +396,8 @@ tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zpool_history_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 
- 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] + 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', + 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 47357dca57f..4098562210b 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -169,6 +169,7 @@ export ZFS_FILES='zdb raidz_test arc_summary arcstat + zilstat dbufstat mount.zfs zed diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 4a815db8a6d..b80489af255 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1230,6 +1230,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_user/misc/arcstat_001_pos.ksh \ functional/cli_user/misc/arc_summary_001_pos.ksh \ functional/cli_user/misc/arc_summary_002_neg.ksh \ + functional/cli_user/misc/zilstat_001_pos.ksh \ functional/cli_user/misc/cleanup.ksh \ functional/cli_user/misc/setup.ksh \ functional/cli_user/misc/zdb_001_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh new file mode 100755 index 00000000000..9bf6a94cfc8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib + +is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing" + +set -A args "" "-s \",\"" "-v" \ + "-f time,zcwc,zimnb,zimsb" + +log_assert "zilstat generates output and doesn't return an error code" + +typeset -i i=0 +while [[ $i -lt ${#args[*]} ]]; do + log_must eval "zilstat ${args[i]} > /dev/null" + ((i = i + 1)) +done +log_pass "zilstat generates output and doesn't return an error code" From 4723eba8c0af10fc25d9203ffa0cd4499b4a875d Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Tue, 9 Aug 2022 09:05:29 +0000 Subject: [PATCH 08/69] FreeBSD: Mark ZFS_MODULE_PARAM_CALL as MPSAFE ZFS_MODULE_PARAM_CALL handlers implement their own locking if needed and do not require Giant. 
Reviewed-by: Alexander Motin Signed-off-by: Ryan Moeller Closes #13756 --- include/os/freebsd/spl/sys/mod_os.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 3a9ebbfc3bc..d64a1733ad8 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -47,7 +47,7 @@ #define ZFS_MODULE_PARAM_CALL_IMPL(parent, name, perm, args, desc) \ SYSCTL_DECL(parent); \ - SYSCTL_PROC(parent, OID_AUTO, name, perm | args, desc) + SYSCTL_PROC(parent, OID_AUTO, name, CTLFLAG_MPSAFE | perm | args, desc) #define ZFS_MODULE_PARAM_CALL( \ scope_prefix, name_prefix, name, func, _, perm, desc) \ From 7bb707ffafbea79c5b2f9ea24959825a3c4b8802 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Tue, 9 Aug 2022 09:05:47 +0000 Subject: [PATCH 09/69] FreeBSD: Organize sysctls FreeBSD had a few platform-specific ARC tunables in the wrong place: - Move FreeBSD-specific ARC tunables into the same vfs.zfs.arc node as the rest of the ARC tunables. - Move the handlers from arc_os.c to sysctl_os.c and add compat sysctls for the legacy names. While here, some additional clean up: - Most handlers are specific to a particular variable and don't need a pointer passed through the args. - Group blocks of related variables, handlers, and sysctl declarations into logical sections. - Match variable types for temporaries in handlers with the type of the global variable. - Remove leftover comments.
Reviewed-by: Alexander Motin Signed-off-by: Ryan Moeller Closes #13756 --- include/os/freebsd/spl/sys/mod_os.h | 26 +- module/os/freebsd/zfs/arc_os.c | 31 +- module/os/freebsd/zfs/sysctl_os.c | 706 +++++++++++++++++----------- 3 files changed, 455 insertions(+), 308 deletions(-) diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index d64a1733ad8..95a19cc940c 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -59,15 +59,21 @@ #define param_set_arc_long_args(var) \ CTLTYPE_ULONG, &var, 0, param_set_arc_long, "LU" -#define param_set_arc_min_args(var) \ - CTLTYPE_ULONG, &var, 0, param_set_arc_min, "LU" - -#define param_set_arc_max_args(var) \ - CTLTYPE_ULONG, &var, 0, param_set_arc_max, "LU" - #define param_set_arc_int_args(var) \ CTLTYPE_INT, &var, 0, param_set_arc_int, "I" +#define param_set_arc_min_args(var) \ + CTLTYPE_ULONG, NULL, 0, param_set_arc_min, "LU" + +#define param_set_arc_max_args(var) \ + CTLTYPE_ULONG, NULL, 0, param_set_arc_max, "LU" + +#define param_set_arc_free_target_args(var) \ + CTLTYPE_UINT, NULL, 0, param_set_arc_free_target, "IU" + +#define param_set_arc_no_grow_shift_args(var) \ + CTLTYPE_INT, NULL, 0, param_set_arc_no_grow_shift, "I" + #define param_set_deadman_failmode_args(var) \ CTLTYPE_STRING, NULL, 0, param_set_deadman_failmode, "A" @@ -78,16 +84,16 @@ CTLTYPE_ULONG, NULL, 0, param_set_deadman_ziotime, "LU" #define param_set_multihost_interval_args(var) \ - CTLTYPE_ULONG, &var, 0, param_set_multihost_interval, "LU" + CTLTYPE_ULONG, NULL, 0, param_set_multihost_interval, "LU" #define param_set_slop_shift_args(var) \ - CTLTYPE_INT, &var, 0, param_set_slop_shift, "I" + CTLTYPE_INT, NULL, 0, param_set_slop_shift, "I" #define param_set_min_auto_ashift_args(var) \ - CTLTYPE_U64, &var, 0, param_set_min_auto_ashift, "QU" + CTLTYPE_U64, NULL, 0, param_set_min_auto_ashift, "QU" #define param_set_max_auto_ashift_args(var) \ - CTLTYPE_U64, &var, 0, 
param_set_max_auto_ashift, "QU" + CTLTYPE_U64, NULL, 0, param_set_max_auto_ashift, "QU" #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index dbd71ea43fd..b4833adedcc 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -72,31 +72,14 @@ SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, * We don't have a tunable for arc_free_target due to the dependency on * pagedaemon initialisation. */ -static int -sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) -{ - uint_t val; - int err; - - val = zfs_arc_free_target; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < minfree) - return (EINVAL); - if (val > vm_cnt.v_page_count) - return (EINVAL); - - zfs_arc_free_target = val; - - return (0); -} -SYSCTL_DECL(_vfs_zfs); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint_t), - sysctl_vfs_zfs_arc_free_target, "IU", +int param_set_arc_free_target(SYSCTL_HANDLER_ARGS); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, free_target, + param_set_arc_free_target, 0, CTLFLAG_RW, "Desired number of free pages below which ARC triggers reclaim"); +int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift, + param_set_arc_no_grow_shift, 0, ZMOD_RW, + "log2(fraction of ARC which must be free to allow growing)"); int64_t arc_available_memory(void) diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index c774f05ff70..cd384c205df 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -91,6 +91,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, @@ -132,170 +133,8 @@ SYSCTL_DECL(_vfs_zfs_version); 
SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); -extern arc_state_t ARC_anon; -extern arc_state_t ARC_mru; -extern arc_state_t ARC_mru_ghost; -extern arc_state_t ARC_mfu; -extern arc_state_t ARC_mfu_ghost; -extern arc_state_t ARC_l2c_only; - -/* - * minimum lifespan of a prefetch block in clock ticks - * (initialized in arc_init()) - */ - /* arc.c */ -int -param_set_arc_max(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_arc_max; - err = sysctl_handle_long(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (SET_ERROR(err)); - - if (val != 0 && (val < MIN_ARC_MAX || val <= arc_c_min || - val >= arc_all_memory())) - return (SET_ERROR(EINVAL)); - - zfs_arc_max = val; - arc_tuning_update(B_TRUE); - - /* Update the sysctl to the tuned value */ - if (val != 0) - zfs_arc_max = arc_c_max; - - return (0); -} - -int -param_set_arc_min(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_arc_min; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (SET_ERROR(err)); - - if (val != 0 && (val < 2ULL << SPA_MAXBLOCKSHIFT || val > arc_c_max)) - return (SET_ERROR(EINVAL)); - - zfs_arc_min = val; - arc_tuning_update(B_TRUE); - - /* Update the sysctl to the tuned value */ - if (val != 0) - zfs_arc_min = arc_c_min; - - return (0); -} - -/* legacy compat */ -extern uint64_t l2arc_write_max; /* def max write size */ -extern uint64_t l2arc_write_boost; /* extra warmup write */ -extern uint64_t l2arc_headroom; /* # of dev writes */ -extern uint64_t l2arc_headroom_boost; -extern uint64_t l2arc_feed_secs; /* interval seconds */ -extern uint64_t l2arc_feed_min_ms; /* min interval msecs */ -extern int l2arc_noprefetch; /* don't cache prefetch bufs */ -extern int l2arc_feed_again; /* turbo warmup */ -extern int l2arc_norw; /* no reads during writes */ - -/* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, 
l2arc_write_max, CTLFLAG_RW, - &l2arc_write_max, 0, "max write size (LEGACY)"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, - &l2arc_write_boost, 0, "extra write during warmup (LEGACY)"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, - &l2arc_headroom, 0, "number of dev writes (LEGACY)"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, - &l2arc_feed_secs, 0, "interval seconds (LEGACY)"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, - &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)"); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, - &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, - &l2arc_feed_again, 0, "turbo warmup (LEGACY)"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, - &l2arc_norw, 0, "no reads during writes (LEGACY)"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, - &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of anonymous state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, - &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, - &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, - 
&ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru ghost state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, - &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu ghost state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, - &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); -/* END CSTYLED */ - -static int -sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) -{ - int err, val; - - val = arc_no_grow_shift; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < 0 || val >= arc_shrink_shift) - return (EINVAL); - - arc_no_grow_shift = val; - return (0); -} - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, - CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, sizeof (int), - sysctl_vfs_zfs_arc_no_grow_shift, "I", - "log2(fraction of ARC which must be free to allow growing)"); - int param_set_arc_long(SYSCTL_HANDLER_ARGS) { @@ -324,55 +163,319 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS) return (0); } +int 
+param_set_arc_max(SYSCTL_HANDLER_ARGS) +{ + unsigned long val; + int err; + + val = zfs_arc_max; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val != 0 && (val < MIN_ARC_MAX || val <= arc_c_min || + val >= arc_all_memory())) + return (SET_ERROR(EINVAL)); + + zfs_arc_max = val; + arc_tuning_update(B_TRUE); + + /* Update the sysctl to the tuned value */ + if (val != 0) + zfs_arc_max = arc_c_max; + + return (0); +} + +/* BEGIN CSTYLED */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, + CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 0, param_set_arc_max, "LU", + "Maximum ARC size in bytes (LEGACY)"); +/* END CSTYLED */ + +int +param_set_arc_min(SYSCTL_HANDLER_ARGS) +{ + unsigned long val; + int err; + + val = zfs_arc_min; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val != 0 && (val < 2ULL << SPA_MAXBLOCKSHIFT || val > arc_c_max)) + return (SET_ERROR(EINVAL)); + + zfs_arc_min = val; + arc_tuning_update(B_TRUE); + + /* Update the sysctl to the tuned value */ + if (val != 0) + zfs_arc_min = arc_c_min; + + return (0); +} + /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_arc_min, sizeof (zfs_arc_min), param_set_arc_min, "LU", - "min arc size (LEGACY)"); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, - CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_arc_max, sizeof (zfs_arc_max), param_set_arc_max, "LU", - "max arc size (LEGACY)"); + NULL, 0, param_set_arc_min, "LU", + "Minimum ARC size in bytes (LEGACY)"); +/* END CSTYLED */ + +extern uint_t zfs_arc_free_target; + +static int +param_set_arc_free_target(SYSCTL_HANDLER_ARGS) +{ + uint_t val; + int err; + + val = zfs_arc_free_target; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < minfree) + return (EINVAL); + if (val > 
vm_cnt.v_page_count) + return (EINVAL); + + zfs_arc_free_target = val; + + return (0); +} + +/* + * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on + * pagedaemon initialization. + */ +/* BEGIN CSTYLED */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, 0, param_set_arc_free_target, "IU", + "Desired number of free pages below which ARC triggers reclaim" + " (LEGACY)"); +/* END CSTYLED */ + +static int +param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) +{ + int err, val; + + val = arc_no_grow_shift; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 0 || val >= arc_shrink_shift) + return (EINVAL); + + arc_no_grow_shift = val; + + return (0); +} + +/* BEGIN CSTYLED */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + NULL, 0, param_set_arc_no_grow_shift, "I", + "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); +/* END CSTYLED */ + +extern uint64_t l2arc_write_max; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, + CTLFLAG_RWTUN, &l2arc_write_max, 0, + "Max write bytes per interval (LEGACY)"); +/* END CSTYLED */ + +extern uint64_t l2arc_write_boost; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, + CTLFLAG_RWTUN, &l2arc_write_boost, 0, + "Extra write bytes during device warmup (LEGACY)"); +/* END CSTYLED */ + +extern uint64_t l2arc_headroom; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, + CTLFLAG_RWTUN, &l2arc_headroom, 0, + "Number of max device writes to precache (LEGACY)"); +/* END CSTYLED */ + +extern uint64_t l2arc_headroom_boost; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, + CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, + "Compressed l2arc_headroom multiplier (LEGACY)"); +/* END CSTYLED */ + +extern uint64_t l2arc_feed_secs; + +/* BEGIN 
CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, + CTLFLAG_RWTUN, &l2arc_feed_secs, 0, + "Seconds between L2ARC writing (LEGACY)"); +/* END CSTYLED */ + +extern uint64_t l2arc_feed_min_ms; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, + CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, + "Min feed interval in milliseconds (LEGACY)"); +/* END CSTYLED */ + +extern int l2arc_noprefetch; + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, + CTLFLAG_RWTUN, &l2arc_noprefetch, 0, + "Skip caching prefetched buffers (LEGACY)"); +/* END CSTYLED */ + +extern int l2arc_feed_again; + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, + CTLFLAG_RWTUN, &l2arc_feed_again, 0, + "Turbo L2ARC warmup (LEGACY)"); +/* END CSTYLED */ + +extern int l2arc_norw; + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, + CTLFLAG_RWTUN, &l2arc_norw, 0, + "No reads during writes (LEGACY)"); +/* END CSTYLED */ + +extern arc_state_t ARC_anon; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, + &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of anonymous state"); +/* END CSTYLED */ + +extern arc_state_t ARC_mru; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, + &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru state"); +/* END CSTYLED */ + +extern arc_state_t ARC_mru_ghost; + +/* BEGIN 
CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, + &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru ghost state"); +/* END CSTYLED */ + +extern arc_state_t ARC_mfu; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, + &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu state"); +/* END CSTYLED */ + +extern arc_state_t ARC_mfu_ghost; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu ghost state"); +/* END CSTYLED */ + +extern arc_state_t ARC_l2c_only; + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, + &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); /* END CSTYLED */ /* dbuf.c */ - /* dmu.c */ /* dmu_zfetch.c */ + SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); -/* max bytes to prefetch per stream (default 8MB) */ extern uint32_t zfetch_max_distance; -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, 
CTLFLAG_RWTUN, - &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); -/* max bytes to prefetch indirects for per stream (default 64MB) */ -extern uint32_t zfetch_max_idistance; /* BEGIN CSTYLED */ -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, - &zfetch_max_idistance, 0, +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, + CTLFLAG_RWTUN, &zfetch_max_distance, 0, + "Max bytes to prefetch per stream (LEGACY)"); +/* END CSTYLED */ + +extern uint32_t zfetch_max_idistance; + +/* BEGIN CSTYLED */ +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, + CTLFLAG_RWTUN, &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream (LEGACY)"); /* END CSTYLED */ /* dsl_pool.c */ /* dnode.c */ + extern int zfs_default_bs; + +/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, &zfs_default_bs, 0, "Default dnode block shift"); +/* END CSTYLED */ extern int zfs_default_ibs; -SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, - &zfs_default_ibs, 0, "Default dnode indirect block shift"); +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, + &zfs_default_ibs, 0, "Default dnode indirect block shift"); +/* END CSTYLED */ /* dsl_scan.c */ /* metaslab.c */ -/* BEGIN CSTYLED */ /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each @@ -382,10 +485,13 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, * is 8~16K. */ extern int zfs_metaslab_sm_blksz_no_log; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, - &zfs_metaslab_sm_blksz_no_log, 0, + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, + CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0, "Block size for space map in pools with log space map disabled. 
" "Power of 2 greater than 4096."); +/* END CSTYLED */ /* * When the log space map feature is enabled, we accumulate a lot of @@ -393,10 +499,13 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, * from a bigger block size like 128K for the metaslab space maps. */ extern int zfs_metaslab_sm_blksz_with_log; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, - &zfs_metaslab_sm_blksz_with_log, 0, + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, + CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0, "Block size for space map in pools with log space map enabled. " "Power of 2 greater than 4096."); +/* END CSTYLED */ /* * The in-core space map representation is more compact than its on-disk form. @@ -405,21 +514,30 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, * Values should be greater than or equal to 100. */ extern int zfs_condense_pct; -SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, - &zfs_condense_pct, 0, + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, + CTLFLAG_RWTUN, &zfs_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); +/* END CSTYLED */ extern int zfs_remove_max_segment; -SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, - &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will" - " attempt to allocate when removing a device"); + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, + CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, + "Largest contiguous segment ZFS will attempt to allocate when removing" + " a device"); +/* END CSTYLED */ extern int zfs_removal_suspend_progress; -SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, - &zfs_removal_suspend_progress, 0, - "Ensures certain actions can happen while in the middle of a removal"); +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, 
removal_suspend_progress, + CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, + "Ensures certain actions can happen while in the middle of a removal"); +/* END CSTYLED */ /* * Minimum size which forces the dynamic allocator to change @@ -428,9 +546,13 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, * aggressive strategy (i.e search by size rather than offset). */ extern uint64_t metaslab_df_alloc_threshold; -SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, - &metaslab_df_alloc_threshold, 0, "Minimum size which forces the dynamic" - " allocator to change its allocation strategy"); + +/* BEGIN CSTYLED */ +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, + CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0, + "Minimum size which forces the dynamic allocator to change its" + " allocation strategy"); +/* END CSTYLED */ /* * The minimum free space, in percent, which must be available @@ -439,46 +561,84 @@ SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, * switch to using best-fit allocations. */ extern int metaslab_df_free_pct; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, - &metaslab_df_free_pct, 0, + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, + CTLFLAG_RWTUN, &metaslab_df_free_pct, 0, "The minimum free space, in percent, which must be available in a" " space map to continue allocations in a first-fit fashion"); +/* END CSTYLED */ /* * Percentage of all cpus that can be used by the metaslab taskq. */ extern int metaslab_load_pct; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, - &metaslab_load_pct, 0, + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, + CTLFLAG_RWTUN, &metaslab_load_pct, 0, "Percentage of cpus that can be used by the metaslab taskq"); +/* END CSTYLED */ /* * Max number of metaslabs per group to preload. 
*/ extern int metaslab_preload_limit; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, - &metaslab_preload_limit, 0, + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, + CTLFLAG_RWTUN, &metaslab_preload_limit, 0, "Max number of metaslabs per group to preload"); +/* END CSTYLED */ + +/* mmp.c */ + +int +param_set_multihost_interval(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_long(oidp, &zfs_multihost_interval, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (spa_mode_global != SPA_MODE_UNINIT) + mmp_signal_all_threads(); + + return (0); +} /* spa.c */ + extern int zfs_ccw_retry_interval; -SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, - &zfs_ccw_retry_interval, 0, "Configuration cache file write," - " retry after failure, interval (seconds)"); + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, + CTLFLAG_RWTUN, &zfs_ccw_retry_interval, 0, + "Configuration cache file write, retry after failure, interval" + " (seconds)"); +/* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_cachefile; -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, - &zfs_max_missing_tvds_cachefile, 0, - "allow importing pools with missing top-level vdevs in cache file"); + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, + CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, + "Allow importing pools with missing top-level vdevs in cache file"); +/* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_scan; -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, - &zfs_max_missing_tvds_scan, 0, - "allow importing pools with missing top-level vdevs during scan"); + +/* BEGIN CSTYLED */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, + CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, + "Allow importing pools with missing top-level vdevs during scan"); /* END CSTYLED */ /* spa_misc.c */ + 
extern int zfs_flags; + static int sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) { @@ -566,14 +726,37 @@ param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) return (-param_set_deadman_failmode_common(buf)); } +int +param_set_slop_shift(SYSCTL_HANDLER_ARGS) +{ + int val; + int err; + + val = spa_slop_shift; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 1 || val > 31) + return (EINVAL); + + spa_slop_shift = val; + + return (0); +} /* spacemap.c */ + extern int space_map_ibs; + +/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, &space_map_ibs, 0, "Space map indirect block shift"); +/* END CSTYLED */ /* vdev.c */ + int param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) { @@ -593,6 +776,14 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) return (0); } +/* BEGIN CSTYLED */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), + param_set_min_auto_ashift, "QU", + "Min ashift used when creating new top-level vdev. (LEGACY)"); +/* END CSTYLED */ + int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { @@ -613,26 +804,25 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) } /* BEGIN CSTYLED */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), - param_set_min_auto_ashift, "QU", - "Min ashift used when creating new top-level vdev. (LEGACY)"); SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), param_set_max_auto_ashift, "QU", "Max ashift used when optimizing for logical -> physical sector size on" " new top-level vdevs. (LEGACY)"); +/* END CSTYLED */ /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. 
*/ extern int zfs_vdev_dtl_sm_blksz; -SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, - &zfs_vdev_dtl_sm_blksz, 0, + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, + CTLFLAG_RDTUN, &zfs_vdev_dtl_sm_blksz, 0, "Block size for DTL space map. Power of 2 greater than 4096."); +/* END CSTYLED */ /* * vdev-wide space maps that have lots of entries written to them at @@ -640,80 +830,48 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ extern int zfs_vdev_standard_sm_blksz; -SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, - &zfs_vdev_standard_sm_blksz, 0, + +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, + CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); /* END CSTYLED */ extern int vdev_validate_skip; -SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, - &vdev_validate_skip, 0, "Enable to bypass vdev_validate()."); +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, + CTLFLAG_RDTUN, &vdev_validate_skip, 0, + "Enable to bypass vdev_validate()."); +/* END CSTYLED */ /* vdev_cache.c */ /* vdev_mirror.c */ -/* - * The load configuration settings below are tuned by default for - * the case where all devices are of the same rotational type. - * - * If there is a mixture of rotating and non-rotating media, setting - * non_rotating_seek_inc to 0 may well provide better results as it - * will direct more reads to the non-rotating vdevs which are more - * likely to have a higher performance. 
- */ - /* vdev_queue.c */ -/* BEGIN CSTYLED */ + extern uint32_t zfs_vdev_max_active; -SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, - &zfs_vdev_max_active, 0, + +/* BEGIN CSTYLED */ +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, + CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device." " (LEGACY)"); - -extern int zfs_vdev_def_queue_depth; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, - &zfs_vdev_def_queue_depth, 0, - "Default queue depth for each allocator"); - - -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, - &zio_exclude_metadata, 0, - "Exclude metadata buffers from dumps as well"); /* END CSTYLED */ -int -param_set_slop_shift(SYSCTL_HANDLER_ARGS) -{ - int val; - int err; +extern int zfs_vdev_def_queue_depth; - val = *(int *)arg1; +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, + CTLFLAG_RWTUN, &zfs_vdev_def_queue_depth, 0, + "Default queue depth for each allocator"); +/* END CSTYLED */ - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); +/* zio.c */ - if (val < 1 || val > 31) - return (EINVAL); - - *(int *)arg1 = val; - - return (0); -} - -int -param_set_multihost_interval(SYSCTL_HANDLER_ARGS) -{ - int err; - - err = sysctl_handle_long(oidp, arg1, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (spa_mode_global != SPA_MODE_UNINIT) - mmp_signal_all_threads(); - - return (0); -} +/* BEGIN CSTYLED */ +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, + CTLFLAG_RDTUN, &zio_exclude_metadata, 0, + "Exclude metadata buffers from dumps as well"); +/* END CSTYLED */ From ee9f3bca5574192589d7c7734fdc81b361aa77db Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Fri, 2 Sep 2022 23:31:19 +0300 Subject: [PATCH 10/69] Add zfs.sync.snapshot_rename Only the single snapshot rename is provided. The recursive or more complex rename can be scripted. 
Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Andriy Gapon Closes #13802 --- include/sys/dsl_dataset.h | 11 +++++ man/man8/zfs-program.8 | 13 ++++++ module/zfs/dsl_dataset.c | 12 +----- module/zfs/zcp_synctask.c | 37 +++++++++++++++++ tests/zfs-tests/tests/Makefile.am | 2 + .../synctask_core/tst.snapshot_rename.ksh | 41 +++++++++++++++++++ .../synctask_core/tst.snapshot_rename.zcp | 27 ++++++++++++ 7 files changed, 133 insertions(+), 10 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_rename.ksh create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_rename.zcp diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 81d25da831b..3450527af7e 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -301,6 +301,14 @@ typedef struct dsl_dataset_snapshot_arg { proc_t *ddsa_proc; } dsl_dataset_snapshot_arg_t; +typedef struct dsl_dataset_rename_snapshot_arg { + const char *ddrsa_fsname; + const char *ddrsa_oldsnapname; + const char *ddrsa_newsnapname; + boolean_t ddrsa_recursive; + dmu_tx_t *ddrsa_tx; +} dsl_dataset_rename_snapshot_arg_t; + /* * The max length of a temporary tag prefix is the number of hex digits * required to express UINT64_MAX plus one for the hyphen. 
@@ -473,6 +481,9 @@ void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx); int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, nvlist_t *result); +int dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx); +void dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx); + uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds); void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); diff --git a/man/man8/zfs-program.8 b/man/man8/zfs-program.8 index 06415b2190e..928620362be 100644 --- a/man/man8/zfs-program.8 +++ b/man/man8/zfs-program.8 @@ -424,6 +424,19 @@ To enable taking snapshots from ZCP scripts, the pool must be upgraded. .It Ar dataset Pq string Name of snapshot to create. .El +.It Fn zfs.sync.rename_snapshot dataset oldsnapname newsnapname +Rename a snapshot of a filesystem or a volume. +Returns 0 if the snapshot was successfully renamed, +and a nonzero error code otherwise. +.Pp +.Bl -tag -compact -width "newbookmark (string)" +.It Ar dataset Pq string +Name of the snapshot's parent dataset. +.It Ar oldsnapname Pq string +Original name of the snapshot. +.It Ar newsnapname Pq string +New name of the snapshot. +.El .It Fn zfs.sync.bookmark source newbookmark Create a bookmark of an existing source snapshot or bookmark. 
Returns 0 if the new bookmark was successfully created, diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 8f3240a5deb..44da6a3f0d4 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2915,14 +2915,6 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) return (B_FALSE); } -typedef struct dsl_dataset_rename_snapshot_arg { - const char *ddrsa_fsname; - const char *ddrsa_oldsnapname; - const char *ddrsa_newsnapname; - boolean_t ddrsa_recursive; - dmu_tx_t *ddrsa_tx; -} dsl_dataset_rename_snapshot_arg_t; - static int dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) @@ -2953,7 +2945,7 @@ dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, return (error); } -static int +int dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; @@ -3015,7 +3007,7 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, return (0); } -static void +void dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; diff --git a/module/zfs/zcp_synctask.c b/module/zfs/zcp_synctask.c index 24210117eca..058910054d9 100644 --- a/module/zfs/zcp_synctask.c +++ b/module/zfs/zcp_synctask.c @@ -302,6 +302,42 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) return (err); } +static int zcp_synctask_rename_snapshot(lua_State *, boolean_t, nvlist_t *); +static const zcp_synctask_info_t zcp_synctask_rename_snapshot_info = { + .name = "rename_snapshot", + .func = zcp_synctask_rename_snapshot, + .pargs = { + {.za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING }, + {.za_name = "oldsnapname", .za_lua_type = LUA_TSTRING }, + {.za_name = "newsnapname", .za_lua_type = LUA_TSTRING }, + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 1 +}; + +static int +zcp_synctask_rename_snapshot(lua_State *state, boolean_t sync, + nvlist_t *err_details) 
+{ + (void) err_details; + int err; + const char *fsname = lua_tostring(state, 1); + const char *oldsnapname = lua_tostring(state, 2); + const char *newsnapname = lua_tostring(state, 3); + + struct dsl_dataset_rename_snapshot_arg ddrsa = { 0 }; + ddrsa.ddrsa_fsname = fsname; + ddrsa.ddrsa_oldsnapname = oldsnapname; + ddrsa.ddrsa_newsnapname = newsnapname; + ddrsa.ddrsa_recursive = B_FALSE; + + err = zcp_sync_task(state, dsl_dataset_rename_snapshot_check, + dsl_dataset_rename_snapshot_sync, &ddrsa, sync, NULL); + + return (err); +} + static int zcp_synctask_inherit_prop(lua_State *, boolean_t, nvlist_t *err_details); static const zcp_synctask_info_t zcp_synctask_inherit_prop_info = { @@ -529,6 +565,7 @@ zcp_load_synctask_lib(lua_State *state, boolean_t sync) &zcp_synctask_promote_info, &zcp_synctask_rollback_info, &zcp_synctask_snapshot_info, + &zcp_synctask_rename_snapshot_info, &zcp_synctask_inherit_prop_info, &zcp_synctask_bookmark_info, &zcp_synctask_set_prop_info, diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index b80489af255..89b2ca866c2 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -129,6 +129,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \ functional/channel_program/synctask_core/tst.snapshot_neg.zcp \ functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \ + functional/channel_program/synctask_core/tst.snapshot_rename.zcp \ functional/channel_program/synctask_core/tst.snapshot_simple.zcp \ functional/checksum/default.cfg \ functional/clean_mirror/clean_mirror_common.kshlib \ @@ -536,6 +537,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \ functional/channel_program/synctask_core/tst.snapshot_neg.ksh \ functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \ + 
functional/channel_program/synctask_core/tst.snapshot_rename.ksh \ functional/channel_program/synctask_core/tst.snapshot_simple.ksh \ functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \ functional/chattr/chattr_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_rename.ksh b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_rename.ksh new file mode 100755 index 00000000000..0561e4b7c63 --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_rename.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2022 by Andriy Gapon. All rights reserved. +# + +. 
$STF_SUITE/tests/functional/channel_program/channel_common.kshlib + +# +# DESCRIPTION: Make sure basic snapshot rename functionality works in channel programs +# + +verify_runnable "global" + +fs=$TESTPOOL/$TESTFS/testchild +snapname1=testsnap1 +snapname2=testsnap2 + +function cleanup +{ + destroy_dataset $fs "-R" +} + +log_onexit cleanup + +log_must zfs create $fs + +log_must_program_sync $TESTPOOL \ + $ZCP_ROOT/synctask_core/tst.snapshot_rename.zcp $fs $snapname1 $snapname2 + +log_pass "Snapshot renaming works" diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_rename.zcp b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_rename.zcp new file mode 100644 index 00000000000..ef893d1551d --- /dev/null +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_rename.zcp @@ -0,0 +1,27 @@ +-- +-- This file and its contents are supplied under the terms of the +-- Common Development and Distribution License ("CDDL"), version 1.0. +-- You may only use this file in accordance with the terms of version +-- 1.0 of the CDDL. +-- +-- A full copy of the text of the CDDL should have accompanied this +-- source. A copy of the CDDL is also available via the Internet at +-- http://www.illumos.org/license/CDDL. +-- + +-- +-- Copyright (c) 2022 by Andriy Gapon. All rights reserved. +-- + +-- This program should be invoked as "zfs program <pool> <prog> <fs> <oldsnap> <newsnap>" + +args = ... +argv = args["argv"] +assert(zfs.sync.snapshot(argv[1] .. "@" .. argv[2]) == 0) +assert(zfs.sync.rename_snapshot(argv[1], argv[2], argv[3]) == 0) +snaps = {} +for s in zfs.list.snapshots(argv[1]) do + table.insert(snaps, s) +end +assert(#snaps == 1) +assert(snaps[1] == (argv[1] .. "@" .. 
argv[3])) From 59767479acb6edb12335460c9e5f7cfd9a3823cc Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Sat, 3 Sep 2022 01:33:50 +0500 Subject: [PATCH 11/69] Add DD_FIELD string for snapshots_changed property This commit adds DD_FIELD string used in extensified dsl_dir zap object for snapshots_changed property. Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Umer Saleem Closes #13819 --- include/sys/dsl_dir.h | 1 + module/zfs/dsl_dir.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 664230f146a..384f98e8f72 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -52,6 +52,7 @@ struct zthr; #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" #define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj" #define DD_FIELD_LIVELIST "com.delphix:livelist" +#define DD_FIELD_SNAPSHOTS_CHANGED "com.ixsystems:snapshots_changed" typedef enum dd_used { DD_USED_HEAD, diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 7460269384b..a4db3ee2f30 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -271,7 +271,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, if (dsl_dir_is_zapified(dd)) { inode_timespec_t t = {0}; zap_lookup(dp->dp_meta_objset, ddobj, - zfs_prop_to_name(ZFS_PROP_SNAPSHOTS_CHANGED), + DD_FIELD_SNAPSHOTS_CHANGED, sizeof (uint64_t), sizeof (inode_timespec_t) / sizeof (uint64_t), &t); @@ -2265,7 +2265,7 @@ dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx) uint64_t ddobj = dd->dd_object; dsl_dir_zapify(dd, tx); VERIFY0(zap_update(mos, ddobj, - zfs_prop_to_name(ZFS_PROP_SNAPSHOTS_CHANGED), + DD_FIELD_SNAPSHOTS_CHANGED, sizeof (uint64_t), sizeof (inode_timespec_t) / sizeof (uint64_t), &t, tx)); From 7c0e3941cdd4692d46c9171b791fa689f6bb1bfd Mon Sep 17 00:00:00 2001 From: Samuel <50765275+npc203@users.noreply.github.com> Date: Tue, 6 Sep 2022 22:07:47 +0530 Subject: [PATCH 12/69] Fix column width in 'zpool iostat -v' and 'zpool 
list -v' This commit fixes a minor spacing issue caused when enumerating vdev names, which originated from #13031 Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Signed-off-by: Samuel Wycliffe Closes #13811 --- cmd/zpool/zpool_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 13a51691fa7..b5b0beef532 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -5466,8 +5466,8 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data) * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ - width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_vdevs.cb_name_flags, - cb->cb_verbose); + width = get_namewidth(zhp, cb->cb_namewidth, + cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the @@ -6298,8 +6298,8 @@ get_namewidth_list(zpool_handle_t *zhp, void *data) list_cbdata_t *cb = data; int width; - width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, - cb->cb_verbose); + width = get_namewidth(zhp, cb->cb_namewidth, + cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); if (width < 9) width = 9; From 11df48ab8ba374de944cd0483c55ddaaad46b91d Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 6 Sep 2022 12:43:21 -0400 Subject: [PATCH 13/69] Cleanup Raid-Z Typo fixes Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13834 --- include/sys/vdev_raidz_impl.h | 2 +- module/zfs/vdev_raidz_math_impl.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 12f5eff22c6..c1037fa12e3 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -321,7 +321,7 @@ vdev_raidz_exp2(const uint8_t a, const unsigned exp) * Galois Field operations. 
* * gf_exp2 - computes 2 raised to the given power - * gf_exp2 - computes 4 raised to the given power + * gf_exp4 - computes 4 raised to the given power * gf_mul - multiplication * gf_div - division * gf_inv - multiplicative inverse diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index 2d96f602314..8ba7e0cd769 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -460,8 +460,8 @@ static void raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, const size_t dsize) { - v_t *p = (v_t *)c[0]; - v_t *q = (v_t *)c[1]; + v_t *p = (v_t *)c[CODE_P]; + v_t *q = (v_t *)c[CODE_Q]; v_t *r = (v_t *)c[CODE_R]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); @@ -486,7 +486,7 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, /* - * Generate PQR parity (RAIDZ2) + * Generate PQR parity (RAIDZ3) * * @rr RAIDZ row */ From 9d0887402ba505fa7f82ffeb1e22c34fb07c83ed Mon Sep 17 00:00:00 2001 From: Rob Wing Date: Sat, 13 Aug 2022 21:09:49 -0800 Subject: [PATCH 14/69] FreeBSD: add knlist_init_sx() for exclusive locks This will be used to implement kqfilter support for zvol cdevs. 
Reviewed-by: Ryan Moeller Reviewed-by: Alexander Motin Signed-off-by: Rob Wing Closes #13773 --- include/os/freebsd/zfs/sys/freebsd_event.h | 37 ++++++++++++ module/Makefile.bsd | 1 + module/os/freebsd/zfs/event_os.c | 65 ++++++++++++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 include/os/freebsd/zfs/sys/freebsd_event.h create mode 100644 module/os/freebsd/zfs/event_os.c diff --git a/include/os/freebsd/zfs/sys/freebsd_event.h b/include/os/freebsd/zfs/sys/freebsd_event.h new file mode 100644 index 00000000000..544ff8b0f81 --- /dev/null +++ b/include/os/freebsd/zfs/sys/freebsd_event.h @@ -0,0 +1,37 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Rob Wing + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ZFS_FREEBSD_EVENT_H +#define _ZFS_FREEBSD_EVENT_H + +#ifdef _KERNEL + +void knlist_init_sx(struct knlist *knl, struct sx *lock); + +#endif /* _KERNEL */ + +#endif /* !_ZFS_FREEBSD_EVENT_H */ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 589ca60b29b..050b6c21e5e 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -172,6 +172,7 @@ SRCS+= abd_os.c \ arc_os.c \ crypto_os.c \ dmu_os.c \ + event_os.c \ hkdf.c \ kmod_core.c \ spa_os.c \ diff --git a/module/os/freebsd/zfs/event_os.c b/module/os/freebsd/zfs/event_os.c new file mode 100644 index 00000000000..97ac151e4fa --- /dev/null +++ b/module/os/freebsd/zfs/event_os.c @@ -0,0 +1,65 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Rob Wing + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include + +static void +knlist_sx_xlock(void *arg) +{ + + sx_xlock((struct sx *)arg); +} + +static void +knlist_sx_xunlock(void *arg) +{ + + sx_xunlock((struct sx *)arg); +} + +static void +knlist_sx_assert_lock(void *arg, int what) +{ + + if (what == LA_LOCKED) + sx_assert((struct sx *)arg, SX_LOCKED); + else + sx_assert((struct sx *)arg, SX_UNLOCKED); +} + +void +knlist_init_sx(struct knlist *knl, struct sx *lock) +{ + + knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock, + knlist_sx_assert_lock); +} From 983096a1b46982a86d25fda2ccdf08079c3e51b9 Mon Sep 17 00:00:00 2001 From: Rob Wing Date: Tue, 1 Feb 2022 20:00:57 -0900 Subject: [PATCH 15/69] FreeBSD: add kqfilter support for zvol cdev The only event hooked up is NOTE_ATTRIB, which is triggered when the device is resized. 
Reviewed-by: Ryan Moeller Reviewed-by: Alexander Motin Signed-off-by: Rob Wing Closes #13773 --- module/os/freebsd/zfs/zvol_os.c | 64 +++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index ac030f75323..8d2a6d77624 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include @@ -123,6 +124,7 @@ struct zvol_state_os { struct zvol_state_dev { struct cdev *zsd_cdev; uint64_t zsd_sync_cnt; + struct selinfo zsd_selinfo; } _zso_dev; /* volmode=geom */ @@ -167,6 +169,7 @@ static d_ioctl_t zvol_cdev_ioctl; static d_read_t zvol_cdev_read; static d_write_t zvol_cdev_write; static d_strategy_t zvol_geom_bio_strategy; +static d_kqfilter_t zvol_cdev_kqfilter; static struct cdevsw zvol_cdevsw = { .d_name = "zvol", @@ -178,6 +181,16 @@ static struct cdevsw zvol_cdevsw = { .d_read = zvol_cdev_read, .d_write = zvol_cdev_write, .d_strategy = zvol_geom_bio_strategy, + .d_kqfilter = zvol_cdev_kqfilter, +}; + +static void zvol_filter_detach(struct knote *kn); +static int zvol_filter_vnode(struct knote *kn, long hint); + +static struct filterops zvol_filterops_vnode = { + .f_isfd = 1, + .f_detach = zvol_filter_detach, + .f_event = zvol_filter_vnode, }; extern uint_t zfs_geom_probe_vdev_key; @@ -601,6 +614,49 @@ zvol_geom_bio_getattr(struct bio *bp) return (1); } +static void +zvol_filter_detach(struct knote *kn) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + + zv = kn->kn_hook; + zsd = &zv->zv_zso->zso_dev; + + knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0); +} + +static int +zvol_filter_vnode(struct knote *kn, long hint) +{ + kn->kn_fflags |= kn->kn_sfflags & hint; + + return (kn->kn_fflags != 0); +} + +static int +zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + + zv = dev->si_drv2; + zsd = &zv->zv_zso->zso_dev; + + if 
(kn->kn_filter != EVFILT_VNODE) + return (EINVAL); + + /* XXX: extend support for other NOTE_* events */ + if (kn->kn_sfflags != NOTE_ATTRIB) + return (EINVAL); + + kn->kn_fop = &zvol_filterops_vnode; + kn->kn_hook = zv; + knlist_add(&zsd->zsd_selinfo.si_note, kn, 0); + + return (0); +} + static void zvol_geom_bio_strategy(struct bio *bp) { @@ -1306,6 +1362,8 @@ zvol_os_free(zvol_state_t *zv) if (dev != NULL) { ASSERT3P(dev->si_drv2, ==, NULL); destroy_dev(dev); + knlist_clear(&zsd->zsd_selinfo.si_note, 0); + knlist_destroy(&zsd->zsd_selinfo.si_note); } } @@ -1409,6 +1467,8 @@ zvol_os_create_minor(const char *name) dev->si_iosize_max = MAXPHYS; #endif zsd->zsd_cdev = dev; + knlist_init_sx(&zsd->zsd_selinfo.si_note, + &zv->zv_state_lock); } } (void) strlcpy(zv->zv_name, name, MAXPATHLEN); @@ -1515,6 +1575,10 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) g_resize_provider(pp, zv->zv_volsize); g_topology_unlock(); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + + KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB); } return (0); } From 238cd4b863ba5c1e1d56215d9bbd77be466f7845 Mon Sep 17 00:00:00 2001 From: Walter Huf Date: Tue, 6 Sep 2022 10:02:18 -0700 Subject: [PATCH 16/69] Add xattr_handler support for Android kernels Some ARM BSPs run the Android kernel, which has a modified xattr_handler->get() function signature. This adds support to compile against these kernels. 
Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Walter Huf Closes #13824 --- config/kernel-xattr-handler.m4 | 29 +++++++++++++++++++- include/os/linux/kernel/linux/xattr_compat.h | 14 ++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index 00b1e74a9cc..b6cbfa15500 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -100,6 +100,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ .get = get, }; ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode_flags], [ + #include + + int get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, + size_t size, int flags) { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .get = get, + }; + ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ @@ -142,7 +155,21 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ AC_DEFINE(HAVE_XATTR_GET_DENTRY, 1, [xattr_handler->get() wants dentry]) ],[ - ZFS_LINUX_TEST_ERROR([xattr get()]) + dnl # + dnl # Android API change, + dnl # The xattr_handler->get() callback was + dnl # changed to take dentry, inode and flags. 
+ dnl # + AC_MSG_RESULT(no) + AC_MSG_CHECKING( + [whether xattr_handler->get() wants dentry and inode and flags]) + ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_XATTR_GET_DENTRY_INODE_FLAGS, 1, + [xattr_handler->get() wants dentry and inode and flags]) + ],[ + ZFS_LINUX_TEST_ERROR([xattr get()]) + ]) ]) ]) ]) diff --git a/include/os/linux/kernel/linux/xattr_compat.h b/include/os/linux/kernel/linux/xattr_compat.h index 21c88dd0771..9b83813db70 100644 --- a/include/os/linux/kernel/linux/xattr_compat.h +++ b/include/os/linux/kernel/linux/xattr_compat.h @@ -115,6 +115,20 @@ fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ } +/* + * Android API change, + * The xattr_handler->get() callback was changed to take a dentry and inode + * and flags, because the dentry might not be attached to an inode yet. + */ +#elif defined(HAVE_XATTR_GET_DENTRY_INODE_FLAGS) +#define ZPL_XATTR_GET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + struct inode *inode, const char *name, void *buffer, \ + size_t size, int flags) \ +{ \ + return (__ ## fn(inode, name, buffer, size)); \ +} #else #error "Unsupported kernel" #endif From 5724073517d41cf0a3cc8cc0992274a8dab601da Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 Sep 2022 02:04:15 +0200 Subject: [PATCH 17/69] make DMU_OT_IS_METADATA and DMU_OT_IS_ENCRYPTED return B_TRUE or B_FALSE Without this patch, the ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); at the beginning of dbuf_assign_arcbuf can panic if the object type is a DMU_OT_NEWTYPE that has DMU_OT_METADATA set. While we're at it, fix DMU_OT_IS_ENCRYPTED as well. 
Reviewed-by: Richard Yao Reviewed-by: Alexander Motin Signed-off-by: Christian Schwarz Closes #13842 --- include/sys/dmu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5a3d7d6a505..0a4827e5ec3 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -136,7 +136,7 @@ typedef enum dmu_object_byteswap { #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_METADATA) : \ + (((ot) & DMU_OT_METADATA) != 0) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ @@ -147,7 +147,7 @@ typedef enum dmu_object_byteswap { ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_ENCRYPTED) : \ + (((ot) & DMU_OT_ENCRYPTED) != 0) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* From dff541f698d616ed9f9b1ad3afa44e450efdad7a Mon Sep 17 00:00:00 2001 From: pkubaj Date: Thu, 8 Sep 2022 17:27:25 +0000 Subject: [PATCH 18/69] Fix build on FreeBSD/powerpc64* There's no VSX handler on FreeBSD for now. 
Reviewed-by: Brian Behlendorf Signed-off-by: Piotr Kubaj Closes #13848 --- module/icp/algs/blake3/blake3_x86-64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c index a7552bdde4a..aecd29edb16 100644 --- a/module/icp/algs/blake3/blake3_x86-64.c +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -74,7 +74,7 @@ static boolean_t blake3_is_sse2_supported(void) { #if defined(__x86_64) return (kfpu_allowed() && zfs_sse2_available()); -#elif defined(__PPC64__) +#elif defined(__PPC64__) && defined(__linux__) return (kfpu_allowed() && zfs_vsx_available()); #else return (kfpu_allowed()); @@ -140,7 +140,7 @@ static boolean_t blake3_is_sse41_supported(void) { #if defined(__x86_64) return (kfpu_allowed() && zfs_sse4_1_available()); -#elif defined(__PPC64__) +#elif defined(__PPC64__) && defined(__linux__) return (kfpu_allowed() && zfs_vsx_available()); #else return (kfpu_allowed()); From 380b08098edf152b1d98e4f48b9577ce44d39166 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 8 Sep 2022 13:28:20 -0400 Subject: [PATCH 19/69] Linux SPL module init: Handle memory allocation failures correctly Upon inspection of our code, I noticed that we assume that __alloc_percpu() cannot fail, and while it probably never has failed in practice, technically, it can fail, so we should handle that. Additionally, we incorrectly assume that `taskq_create()` in spl_kmem_cache_init() cannot fail. The same remark applies to it. Lastly, `spl_init()` failures should always return negative error values, but in some places, we are returning positive 1, which is incorrect. We change those values to their correct error codes. 
Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13847 --- module/os/linux/spl/spl-generic.c | 12 ++++++++++-- module/os/linux/spl/spl-kmem-cache.c | 3 +++ module/os/linux/spl/spl-taskq.c | 6 +++--- module/os/linux/spl/spl-tsd.c | 2 +- module/os/linux/spl/spl-zlib.c | 2 +- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c index 5179100d166..de91c44257a 100644 --- a/module/os/linux/spl/spl-generic.c +++ b/module/os/linux/spl/spl-generic.c @@ -705,7 +705,7 @@ spl_kvmem_init(void) * initialize each of the per-cpu seeds so that the sequences generated on each * CPU are guaranteed to never overlap in practice. */ -static void __init +static int __init spl_random_init(void) { uint64_t s[2]; @@ -714,6 +714,9 @@ spl_random_init(void) spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t), sizeof (uint64_t)); + if (!spl_pseudo_entropy) + return (-ENOMEM); + get_random_bytes(s, sizeof (s)); if (s[0] == 0 && s[1] == 0) { @@ -737,6 +740,8 @@ spl_random_init(void) wordp[0] = s[0]; wordp[1] = s[1]; } + + return (0); } static void @@ -757,7 +762,8 @@ spl_init(void) { int rc = 0; - spl_random_init(); + if ((rc = spl_random_init())) + goto out0; if ((rc = spl_kvmem_init())) goto out1; @@ -800,6 +806,8 @@ spl_init(void) out2: spl_kvmem_fini(); out1: + spl_random_fini(); +out0: return (rc); } diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index ba4ca49a2ac..efb8d0c3033 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -1452,6 +1452,9 @@ spl_kmem_cache_init(void) spl_kmem_cache_kmem_threads * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (spl_kmem_cache_taskq == NULL) + return (-ENOMEM); + return (0); } diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 0aab148975a..3b0c29606c2 100644 --- a/module/os/linux/spl/spl-taskq.c +++ 
b/module/os/linux/spl/spl-taskq.c @@ -1379,7 +1379,7 @@ spl_taskq_init(void) system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_taskq == NULL) - return (1); + return (-ENOMEM); system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); @@ -1388,7 +1388,7 @@ spl_taskq_init(void) cpuhp_remove_multi_state(spl_taskq_cpuhp_state); #endif taskq_destroy(system_taskq); - return (1); + return (-ENOMEM); } dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, @@ -1399,7 +1399,7 @@ spl_taskq_init(void) #endif taskq_destroy(system_taskq); taskq_destroy(system_delay_taskq); - return (1); + return (-ENOMEM); } /* diff --git a/module/os/linux/spl/spl-tsd.c b/module/os/linux/spl/spl-tsd.c index 546db9ab8bd..389c9d0d6df 100644 --- a/module/os/linux/spl/spl-tsd.c +++ b/module/os/linux/spl/spl-tsd.c @@ -706,7 +706,7 @@ spl_tsd_init(void) { tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT); if (tsd_hash_table == NULL) - return (1); + return (-ENOMEM); return (0); } diff --git a/module/os/linux/spl/spl-zlib.c b/module/os/linux/spl/spl-zlib.c index 589496da0c7..8c6282ee5d1 100644 --- a/module/os/linux/spl/spl-zlib.c +++ b/module/os/linux/spl/spl-zlib.c @@ -204,7 +204,7 @@ spl_zlib_init(void) size, 0, NULL, NULL, NULL, NULL, NULL, KMC_KVMEM); if (!zlib_workspace_cache) - return (1); + return (-ENOMEM); return (0); } From 320f0c6022e1c9bdc9063f849c6b2e4fa3b93995 Mon Sep 17 00:00:00 2001 From: Finix1979 Date: Fri, 9 Sep 2022 01:29:41 +0800 Subject: [PATCH 20/69] Add Linux posix_fadvise support The purpose of this PR is to accept fadvise calls from userland to do read-ahead on demand. It could dramatically improve sequential read performance especially when primarycache is set to metadata or zfs_prefetch_disable is 1. 
If the file is mmaped, generic_fadvise is also called for page cache read-ahead besides dmu_prefetch. Only POSIX_FADV_WILLNEED and POSIX_FADV_SEQUENTIAL are supported in this PR currently. Reviewed-by: Brian Behlendorf Signed-off-by: Finix Yan Closes #13694 --- config/kernel-fadvise.m4 | 23 +++++ config/kernel-generic_fadvise.m4 | 27 ++++++ config/kernel.m4 | 4 + module/os/linux/zfs/zpl_file.c | 62 ++++++++++++ tests/runfiles/linux.run | 4 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 3 + tests/zfs-tests/cmd/file/file_fadvise.c | 97 +++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 3 + .../functional/checksum/filetest_002_pos.ksh | 2 +- .../tests/functional/fadvise/cleanup.ksh | 28 ++++++ .../functional/fadvise/fadvise_sequential.ksh | 80 +++++++++++++++ .../tests/functional/fadvise/setup.ksh | 30 ++++++ .../functional/fault/auto_spare_002_pos.ksh | 2 +- 15 files changed, 365 insertions(+), 2 deletions(-) create mode 100644 config/kernel-fadvise.m4 create mode 100644 config/kernel-generic_fadvise.m4 create mode 100644 tests/zfs-tests/cmd/file/file_fadvise.c create mode 100755 tests/zfs-tests/tests/functional/fadvise/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh create mode 100755 tests/zfs-tests/tests/functional/fadvise/setup.ksh diff --git a/config/kernel-fadvise.m4 b/config/kernel-fadvise.m4 new file mode 100644 index 00000000000..08912de16ed --- /dev/null +++ b/config/kernel-fadvise.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # Linux 4.19 API +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FADVISE], [ + ZFS_LINUX_TEST_SRC([file_fadvise], [ + #include + + static const struct file_operations + fops __attribute__ ((unused)) = { + .fadvise = NULL, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FADVISE], [ + AC_MSG_CHECKING([whether fops->fadvise() exists]) + ZFS_LINUX_TEST_RESULT([file_fadvise], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILE_FADVISE, 1, [fops->fadvise() 
exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-generic_fadvise.m4 b/config/kernel-generic_fadvise.m4 new file mode 100644 index 00000000000..8d122064b22 --- /dev/null +++ b/config/kernel-generic_fadvise.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # 5.3 API change +dnl # The generic_fadvise() function is present since 4.19 kernel +dnl # but it was not exported until Linux 5.3. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FADVISE], [ + ZFS_LINUX_TEST_SRC([generic_fadvise], [ + #include + ], [ + struct file *fp __attribute__ ((unused)) = NULL; + loff_t offset __attribute__ ((unused)) = 0; + loff_t len __attribute__ ((unused)) = 0; + int advise __attribute__ ((unused)) = 0; + generic_fadvise(fp, offset, len, advise); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FADVISE], [ + AC_MSG_CHECKING([whether generic_fadvise() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_fadvise], + [generic_fadvise], [mm/fadvise.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_FADVISE, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1f274cbe4f3..6aad2cf88e0 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -42,6 +42,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_FALLOCATE + ZFS_AC_KERNEL_SRC_FADVISE + ZFS_AC_KERNEL_SRC_GENERIC_FADVISE ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_SRC_RWSEM ZFS_AC_KERNEL_SRC_SCHED @@ -161,6 +163,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_FALLOCATE + ZFS_AC_KERNEL_FADVISE + ZFS_AC_KERNEL_GENERIC_FADVISE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_RWSEM ZFS_AC_KERNEL_SCHED diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 43b7fb60a99..b0d9f37a3ec 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -27,6 +27,7 @@ #ifdef CONFIG_COMPAT #include #endif 
+#include #include #include #include @@ -37,6 +38,9 @@ defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) #include #endif +#ifdef HAVE_FILE_FADVISE +#include +#endif #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include #endif @@ -906,6 +910,61 @@ zpl_ioctl_getversion(struct file *filp, void __user *arg) return (copy_to_user(arg, &generation, sizeof (generation))); } +#ifdef HAVE_FILE_FADVISE +static int +zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) +{ + struct inode *ip = file_inode(filp); + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os = zfsvfs->z_os; + int error = 0; + + if (S_ISFIFO(ip->i_mode)) + return (-ESPIPE); + + if (offset < 0 || len < 0) + return (-EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + switch (advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_WILLNEED: +#ifdef HAVE_GENERIC_FADVISE + if (zn_has_cached_data(zp)) + error = generic_fadvise(filp, offset, len, advice); +#endif + /* + * Pass on the caller's size directly, but note that + * dmu_prefetch_max will effectively cap it. If there + * really is a larger sequential access pattern, perhaps + * dmu_zfetch will detect it. 
+ */ + if (len == 0) + len = i_size_read(ip) - offset; + + dmu_prefetch(os, zp->z_id, 0, offset, len, + ZIO_PRIORITY_ASYNC_READ); + break; + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_DONTNEED: + case POSIX_FADV_NOREUSE: + /* ignored for now */ + break; + default: + error = -EINVAL; + break; + } + + ZFS_EXIT(zfsvfs); + + return (error); +} +#endif /* HAVE_FILE_FADVISE */ + #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) @@ -1259,6 +1318,9 @@ const struct file_operations zpl_file_operations = { .aio_fsync = zpl_aio_fsync, #endif .fallocate = zpl_fallocate, +#ifdef HAVE_FILE_FADVISE + .fadvise = zpl_fadvise, +#endif .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 9b32e73afb1..09dfb5eb1e1 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -89,6 +89,10 @@ tags = ['functional', 'devices'] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill'] tags = ['functional', 'events'] +[tests/functional/fadvise:Linux] +tests = ['fadvise_sequential'] +tags = ['functional', 'fadvise'] + [tests/functional/fallocate:Linux] tests = ['fallocate_prealloc', 'fallocate_zero-range'] tags = ['functional', 'fallocate'] diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 20d1382532b..1fd54c1dd51 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -4,6 +4,7 @@ /devname2devid /dir_rd_update /draid +/file_fadvise /file_append /file_check /file_trunc diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 3c8faf5afbb..c19c870cf69 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -128,4 +128,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/read_dos_attributes %D%/write_dos_attribu 
scripts_zfs_tests_bin_PROGRAMS += %D%/randfree_file %C%_randfree_file_SOURCES = %D%/file/randfree_file.c + +scripts_zfs_tests_bin_PROGRAMS += %D%/file_fadvise +%C%_file_fadvise_SOURCES = %D%/file/file_fadvise.c endif diff --git a/tests/zfs-tests/cmd/file/file_fadvise.c b/tests/zfs-tests/cmd/file/file_fadvise.c new file mode 100644 index 00000000000..e1afb6d0a11 --- /dev/null +++ b/tests/zfs-tests/cmd/file/file_fadvise.c @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2022 by Information2 Software, Inc. All rights reserved. 
+ */ + +#include "file_common.h" +#include +#include +#include +#include + +/* + * Call fadvise to prefetch data + */ +static const char *execname = "file_fadvise"; + +static void +usage(void) +{ + (void) fprintf(stderr, + "usage: %s -f filename -a advise \n", execname); +} + +int +main(int argc, char *argv[]) +{ + char *filename = NULL; + int advise = 0; + int fd, ch; + int err = 0; + + while ((ch = getopt(argc, argv, "a:f:")) != EOF) { + switch (ch) { + case 'a': + advise = atoll(optarg); + break; + case 'f': + filename = optarg; + break; + case '?': + (void) printf("unknown arg %c\n", optopt); + usage(); + break; + } + } + + if (!filename) { + (void) printf("Filename not specified (-f )\n"); + err++; + } + + if (advise < POSIX_FADV_NORMAL || advise > POSIX_FADV_NOREUSE) { + (void) printf("advise is invalid\n"); + err++; + } + + if (err) { + usage(); /* no return */ + return (1); + } + + if ((fd = open(filename, O_RDWR, 0666)) < 0) { + perror("open"); + return (1); + } + + posix_fadvise(fd, 0, 0, advise); + + close(fd); + + return (0); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 4098562210b..c05b918325b 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -184,6 +184,7 @@ export ZFSTEST_FILES='badsend devname2devid dir_rd_update draid + file_fadvise file_append file_check file_trunc diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 89b2ca866c2..d53316643bc 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1370,6 +1370,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ functional/exec/setup.ksh \ + functional/fadvise/cleanup.ksh \ + functional/fadvise/fadvise_sequential.ksh \ + functional/fadvise/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ 
functional/fallocate/fallocate_punch-hole.ksh \ diff --git a/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh b/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh index a0be1c2050b..23e7aa57748 100755 --- a/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh @@ -76,7 +76,7 @@ while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL - log_mustnot eval "cat $TESTDIR/test_$type >/dev/null" + log_mustnot eval "dd if=$TESTDIR/test_$type of=/dev/null bs=$WRITESZ count=$NWRITES" cksum=$(zpool status -P -v $TESTPOOL | grep "$firstvdev" | \ awk '{print $5}') diff --git a/tests/zfs-tests/tests/functional/fadvise/cleanup.ksh b/tests/zfs-tests/tests/functional/fadvise/cleanup.ksh new file mode 100755 index 00000000000..8b5b43a74c1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fadvise/cleanup.ksh @@ -0,0 +1,28 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Portions Copyright (c) 2022 Information2 Software, Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh b/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh new file mode 100755 index 00000000000..7b7d1d379ac --- /dev/null +++ b/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Portions Copyright (c) 2022 Information2 Software, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib + +# +# DESCRIPTION: +# Test posix_fadvise. +# +# STRATEGY: +# 1. Set primarycache to metadata in order to disable prefetch +# 2. Write some data to file +# 3. get data_size field from arcstat +# 4. call file_fadvise with POSIX_FADV_SEQUENTIAL +# 5. get data_size field from arcstat again +# 6. 
latter data_size should be bigger than former one +# + +# NOTE: if HAVE_FILE_FADVISE is not defined former data_size +# should less or eaqul to latter one + +verify_runnable "global" + +FILE=$TESTDIR/$TESTFILE0 +BLKSZ=$(get_prop recordsize $TESTPOOL) + +function cleanup +{ + log_must zfs set primarycache=all $TESTPOOL + [[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/* +} + +getstat() { + awk -v c="$1" '$1 == c {print $3; exit}' /proc/spl/kstat/zfs/arcstats +} + +log_assert "Ensure fadvise prefetch data" + +log_onexit cleanup + +log_must zfs set primarycache=metadata $TESTPOOL + +log_must file_write -o create -f $FILE -b $BLKSZ -c 1000 +sync_pool $TESTPOOL + +data_size1=$(getstat data_size) + +log_must file_fadvise -f $FILE -a 2 +sleep 10 + +data_size2=$(getstat data_size) +log_note "original data_size is $data_size1, final data_size is $data_size2" + +log_must [ $data_size1 -le $data_size2 ] + +log_pass "Ensure data could be prefetched" diff --git a/tests/zfs-tests/tests/functional/fadvise/setup.ksh b/tests/zfs-tests/tests/functional/fadvise/setup.ksh new file mode 100755 index 00000000000..8ddd73307bb --- /dev/null +++ b/tests/zfs-tests/tests/functional/fadvise/setup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Portions Copyright (c) 2022 Information2 Software, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} +default_setup_noexit $DISK +log_pass diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh index e9517bad713..bd32be9a4ff 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh @@ -73,7 +73,7 @@ for type in "mirror" "raidz" "raidz2"; do # 4. Inject CHECKSUM ERRORS on read with a zinject error handler log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL - log_must cp $TESTFILE /dev/null + log_must dd if=$TESTFILE of=/dev/null bs=1M count=64 # 5. Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare" From 37f6845c6f86b1d04593e55d94318326006f4b5d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 8 Sep 2022 13:30:53 -0400 Subject: [PATCH 21/69] Improve too large physical ashift handling When iterating through children physical ashifts for vdev, prefer ones above the maximum logical ashift, that we can actually use, but within the administrator defined maximum. When selecting top-level vdev ashift, do not set it to the defined maximum in case physical ashift is even higher, but just ignore one. Using the maximum does not prevent misaligned writes, but reduces space efficiency. Since ZFS tries to write data sequentially and aggregates the writes, in many cases large misaligned writes may not be as bad as the space penalty otherwise. Allow internal physical ashifts for vdevs higher than ASHIFT_MAX. Maybe one day the allocator or aggregation could benefit from that. 
Reduce zfs_vdev_max_auto_ashift default from 16 (64KB) to 14 (16KB), so that ZFS may still use bigger ashifts up to SHIFT_MAX (64KB), but only if it really has to or explicitly told to, but not as an "optimization". There are some read-intensive NVMe SSDs that report Preferred Write Alignment of 64KB, and attempt to build RAIDZ2 of those leads to a space inefficiency that can't be justified. Instead these changes make ZFS fall back to logical ashift of 12 (4KB) by default and only warn user that it may be suboptimal for performance. Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #13798 --- include/sys/vdev_impl.h | 1 + man/man4/zfs.4 | 5 ++- module/os/freebsd/zfs/vdev_geom.c | 3 +- module/zfs/vdev.c | 36 +++++++++++++++++-- module/zfs/vdev_draid.c | 10 ++++-- module/zfs/vdev_mirror.c | 10 ++++-- module/zfs/vdev_raidz.c | 10 ++++-- tests/zfs-tests/include/tunables.cfg | 2 ++ .../cli_root/zpool_add/add-o_ashift.ksh | 5 ++- 9 files changed, 69 insertions(+), 13 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index d22abfbc259..470eaa763d5 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -641,6 +641,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); +uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); /* * Vdev ashift optimization tunables diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index cc55ee32ba2..cecaf7e7f0a 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -347,9 +347,12 @@ When a vdev is added, target this number of metaslabs per top-level vdev. .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq int Default limit for metaslab size. . 
-.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong +.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq ulong Maximum ashift used when optimizing for logical \[->] physical sector size on new top-level vdevs. +May be increased up to +.Sy ASHIFT_MAX Po 16 Pc , +but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong Minimum ashift used when creating new top-level vdevs. diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index f3b4846f4e6..fef6a1b88e3 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -955,8 +955,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && - ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && - pp->stripeoffset == 0) + ISP2(pp->stripesize) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ea0245610fb..048616c253c 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -136,7 +136,15 @@ int zfs_vdev_standard_sm_blksz = (1 << 17); */ int zfs_nocacheflush = 0; -uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; +/* + * Maximum and minimum ashift values that can be automatically set based on + * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX + * is higher than the maximum value, it is intentionally limited here to not + * excessively impact pool space efficiency. Higher ashift values may still + * be forced by vdev logical ashift or by user via ashift property, but won't + * be set automatically as a performance optimization. 
+ */ +uint64_t zfs_vdev_max_auto_ashift = 14; uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; void @@ -1845,6 +1853,24 @@ vdev_set_deflate_ratio(vdev_t *vd) } } +/* + * Choose the best of two ashifts, preferring one between logical ashift + * (absolute minimum) and administrator defined maximum, otherwise take + * the biggest of the two. + */ +uint64_t +vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) +{ + if (a > logical && a <= zfs_vdev_max_auto_ashift) { + if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (a); + else + return (MAX(a, b)); + } else if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (MAX(a, b)); + return (b); +} + /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining @@ -1856,7 +1882,8 @@ vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); - if (vd->vdev_ashift < vd->vdev_physical_ashift) { + if (vd->vdev_ashift < vd->vdev_physical_ashift && + vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, @@ -4463,7 +4490,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_configured_ashift = vd->vdev_top != NULL ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; + if (vd->vdev_physical_ashift <= ASHIFT_MAX) + vs->vs_physical_ashift = vd->vdev_physical_ashift; + else + vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 24034d9d931..24ea5d2cbe1 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1496,8 +1496,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); - physical_ashift = MAX(physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + physical_ashift = vdev_best_ashift(logical_ashift, + physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 3879de68045..f9a01c9f53f 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -409,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b4daf642ed2..5a44983e551 100644 --- 
a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1527,8 +1527,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index d6a2fe5db7c..80e7bcb3bd0 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -81,7 +81,9 @@ TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress +VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift +VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 8d5ce5efa52..0166e84baa1 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -57,7 +57,9 @@ disk2=$TEST_BASE_DIR/disk2 log_must mkfile $SIZE $disk1 log_must mkfile $SIZE $disk2 +logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT) orig_ashift=$(get_tunable 
VDEV_FILE_PHYSICAL_ASHIFT) +max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT) typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} @@ -77,7 +79,8 @@ do log_must zpool create $TESTPOOL $disk1 log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift log_must zpool add $TESTPOOL $disk2 - log_must verify_ashift $disk2 $ashift + exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift )) + log_must verify_ashift $disk2 $exp # clean things for the next run log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift From e27e692bcc2c3e5b79f60ef16a2183f2231ff012 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 8 Sep 2022 10:32:30 -0700 Subject: [PATCH 22/69] zed: Fix config_sync autoexpand flood Users were seeing floods of `config_sync` events when autoexpand was enabled. This happened because all "disk status change" udev events invoke the autoexpand codepath, which calls zpool_relabel_disk(), which in turn cause another "disk status change" event to happen, in a feedback loop. Note that "disk status change" happens every time a user calls close() on a block device. This commit breaks the feedback loop by only allowing an autoexpand to happen if the disk actually changed size. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes: #7132 Closes: #7366 Closes #13729 --- cmd/zed/agents/zfs_mod.c | 155 +++++++++++++++++++++++++++++++++++-- cmd/zed/zed_disk_event.c | 16 ++++ include/sys/sysevent/dev.h | 3 + 3 files changed, 166 insertions(+), 8 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index d75854f2875..7364dd2c628 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -894,14 +894,90 @@ zfs_deliver_check(nvlist_t *nvl) return (0); } +/* + * Given a path to a vdev, lookup the vdev's physical size from its + * config nvlist. + * + * Returns the vdev's physical size in bytes on success, 0 on error. 
+ */ +static uint64_t +vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path) +{ + nvlist_t *nvl = NULL; + boolean_t avail_spare, l2cache, log; + vdev_stat_t *vs = NULL; + uint_t c; + + nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); + if (!nvl) + return (0); + + verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + if (!vs) { + zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__, + vdev_path); + return (0); + } + + return (vs->vs_pspace); +} + +/* + * Given a path to a vdev, lookup if the vdev is a "whole disk" in the + * config nvlist. "whole disk" means that ZFS was passed a whole disk + * at pool creation time, which it partitioned up and has full control over. + * Thus a partition with wholedisk=1 set tells us that zfs created the + * partition at creation time. A partition without whole disk set would have + * been created by externally (like with fdisk) and passed to ZFS. + * + * Returns the whole disk value (either 0 or 1). + */ +static uint64_t +vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path) +{ + nvlist_t *nvl = NULL; + boolean_t avail_spare, l2cache, log; + uint64_t wholedisk; + + nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); + if (!nvl) + return (0); + + verify(nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) == 0); + + return (wholedisk); +} + +/* + * If the device size grew more than 1% then return true. 
+ */ +#define DEVICE_GREW(oldsize, newsize) \ + ((newsize > oldsize) && \ + ((newsize / (newsize - oldsize)) <= 100)) + static int zfsdle_vdev_online(zpool_handle_t *zhp, void *data) { - char *devname = data; boolean_t avail_spare, l2cache; + nvlist_t *udev_nvl = data; nvlist_t *tgt; int error; + char *tmp_devname, devname[MAXPATHLEN]; + uint64_t guid; + + if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { + sprintf(devname, "%llu", (u_longlong_t)guid); + } else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH, + &tmp_devname) == 0) { + strlcpy(devname, tmp_devname, MAXPATHLEN); + zfs_append_partition(devname, MAXPATHLEN); + } else { + zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__); + } + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", devname, zpool_get_name(zhp)); @@ -953,12 +1029,75 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) vdev_state_t newstate; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { - error = zpool_vdev_online(zhp, fullpath, 0, - &newstate); - zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " - "setting device '%s' to ONLINE state " - "in pool '%s': %d", fullpath, - zpool_get_name(zhp), error); + /* + * If this disk size has not changed, then + * there's no need to do an autoexpand. To + * check we look at the disk's size in its + * config, and compare it to the disk size + * that udev is reporting. + */ + uint64_t udev_size = 0, conf_size = 0, + wholedisk = 0, udev_parent_size = 0; + + /* + * Get the size of our disk that udev is + * reporting. + */ + if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE, + &udev_size) != 0) { + udev_size = 0; + } + + /* + * Get the size of our disk's parent device + * from udev (where sda1's parent is sda). 
+ */ + if (nvlist_lookup_uint64(udev_nvl, + DEV_PARENT_SIZE, &udev_parent_size) != 0) { + udev_parent_size = 0; + } + + conf_size = vdev_size_from_config(zhp, + fullpath); + + wholedisk = vdev_whole_disk_from_config(zhp, + fullpath); + + /* + * Only attempt an autoexpand if the vdev size + * changed. There are two different cases + * to consider. + * + * 1. wholedisk=1 + * If you do a 'zpool create' on a whole disk + * (like /dev/sda), then zfs will create + * partitions on the disk (like /dev/sda1). In + * that case, wholedisk=1 will be set in the + * partition's nvlist config. So zed will need + * to see if your parent device (/dev/sda) + * expanded in size, and if so, then attempt + * the autoexpand. + * + * 2. wholedisk=0 + * If you do a 'zpool create' on an existing + * partition, or a device that doesn't allow + * partitions, then wholedisk=0, and you will + * simply need to check if the device itself + * expanded in size. + */ + if (DEVICE_GREW(conf_size, udev_size) || + (wholedisk && DEVICE_GREW(conf_size, + udev_parent_size))) { + error = zpool_vdev_online(zhp, fullpath, + 0, &newstate); + + zed_log_msg(LOG_INFO, + "%s: autoexpanding '%s' from %llu" + " to %llu bytes in pool '%s': %d", + __func__, fullpath, conf_size, + MAX(udev_size, udev_parent_size), + zpool_get_name(zhp), error); + } } } zpool_close(zhp); @@ -989,7 +1128,7 @@ zfs_deliver_dle(nvlist_t *nvl) zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); } - if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) { zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " "found", name); return (1); diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 8845c5b2d00..3c8e2fb38c1 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -78,6 +78,8 @@ zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); if 
(nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval); if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) @@ -130,6 +132,20 @@ dev_event_nvlist(struct udev_device *dev) numval *= strtoull(value, NULL, 10); (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + + /* + * If the device has a parent, then get the parent block + * device's size as well. For example, /dev/sda1's parent + * is /dev/sda. + */ + struct udev_device *parent_dev = udev_device_get_parent(dev); + if ((value = udev_device_get_sysattr_value(parent_dev, "size")) + != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval); + } } /* diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h index da6539b4a0d..0783d007316 100644 --- a/include/sys/sysevent/dev.h +++ b/include/sys/sysevent/dev.h @@ -244,6 +244,9 @@ extern "C" { #define DEV_PATH "path" #define DEV_IS_PART "is_slice" #define DEV_SIZE "dev_size" + +/* Size of the whole parent block device (if dev is a partition) */ +#define DEV_PARENT_SIZE "dev_parent_size" #endif /* __linux__ */ #define EV_V1 1 From 60d995727a19104a2832d475f5c0861ffbae2c97 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Thu, 8 Sep 2022 13:40:18 -0400 Subject: [PATCH 23/69] FreeBSD: Replace legacy make_dev() interface usage The function make_dev_s() was introduced to replace make_dev() in FreeBSD 11.0. It allows further specification of properties and flags and returns an error code on failure. Using this we can fail loading the module more gracefully than a panic in situations such as when a device named zfs already exists. We already use it for zvols. 
Use make_dev_s() for /dev/zfs. Reviewed-by: Alexander Motin Signed-off-by: Ryan Moeller Closes #13854 --- module/os/freebsd/zfs/kmod_core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c index 2b808357ecc..020ef6a39b5 100644 --- a/module/os/freebsd/zfs/kmod_core.c +++ b/module/os/freebsd/zfs/kmod_core.c @@ -219,9 +219,16 @@ static struct cdevsw zfs_cdevsw = { int zfsdev_attach(void) { - zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, - ZFS_DRIVER); - return (0); + struct make_dev_args args; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zfs_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0666; + return (make_dev_s(&args, &zfsdev, ZFS_DRIVER)); } void From ede037cda73675f42b1452187e8dd3438fafc220 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Fri, 9 Sep 2022 11:54:16 -0600 Subject: [PATCH 24/69] Make zfs-share service resilient to stale exports There are a few cases where stale entries in /etc/exports.d/zfs.exports will cause the nfs-server service to fail when starting up. Since the nfs-server startup consumes /etc/exports.d/zfs.exports, the zfs-share service (which rebuilds the list of zfs exports) should run before the nfs-server service. To make the zfs-share service resilient to stale exports, this change truncates the zfs config file as part of the zfs share -a operation.
Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Don Brady Closes #13775 --- cmd/zfs/zfs_main.c | 3 ++ etc/systemd/system/zfs-share.service.in | 2 +- include/libzfs.h | 3 +- lib/libshare/libshare.c | 12 +++++- lib/libshare/libshare_impl.h | 3 +- lib/libshare/nfs.c | 12 ++++++ lib/libshare/nfs.h | 2 + lib/libshare/os/freebsd/nfs.c | 9 +++- lib/libshare/os/linux/nfs.c | 9 +++- lib/libspl/include/libshare.h | 3 +- lib/libzfs/libzfs.abi | 55 ++++++++++++++++++------- lib/libzfs/libzfs_mount.c | 12 +++++- 12 files changed, 101 insertions(+), 24 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index f1d686753c2..008f1bea0ec 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -7093,6 +7093,9 @@ share_mount(int op, int argc, char **argv) share_mount_state.sm_total = cb.cb_used; pthread_mutex_init(&share_mount_state.sm_lock, NULL); + /* For a 'zfs share -a' operation start with a clean slate. */ + zfs_truncate_shares(NULL); + /* * libshare isn't mt-safe, so only do the operation in parallel * if we're mounting. Additionally, the key-loading option must diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index 263055e5281..1a6342a06fe 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -1,7 +1,7 @@ [Unit] Description=ZFS file system shares Documentation=man:zfs(8) -After=nfs-server.service nfs-kernel-server.service +Before=nfs-server.service nfs-kernel-server.service After=smb.service Before=rpc-statd-notify.service Wants=zfs-mount.service diff --git a/include/libzfs.h b/include/libzfs.h index 96cf1e18652..92c7bf6d1c9 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright Joyent, Inc. 
* Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. @@ -895,6 +895,7 @@ _LIBZFS_H int zfs_unshare(zfs_handle_t *zhp, const char *mountpoint, _LIBZFS_H int zfs_unshareall(zfs_handle_t *zhp, const enum sa_protocol *proto); _LIBZFS_H void zfs_commit_shares(const enum sa_protocol *proto); +_LIBZFS_H void zfs_truncate_shares(const enum sa_protocol *proto); _LIBZFS_H int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); diff --git a/lib/libshare/libshare.c b/lib/libshare/libshare.c index d6257aa1ef3..d50b4550d6d 100644 --- a/lib/libshare/libshare.c +++ b/lib/libshare/libshare.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Gunnar Beutner - * Copyright (c) 2018, 2020 by Delphix. All rights reserved. + * Copyright (c) 2018, 2022 by Delphix. All rights reserved. */ #include @@ -96,6 +96,16 @@ sa_commit_shares(enum sa_protocol protocol) fstypes[protocol]->commit_shares(); } +void +sa_truncate_shares(enum sa_protocol protocol) +{ + /* CSTYLED */ + VALIDATE_PROTOCOL(protocol, ); + + if (fstypes[protocol]->truncate_shares != NULL) + fstypes[protocol]->truncate_shares(); +} + int sa_validate_shareopts(const char *options, enum sa_protocol protocol) { diff --git a/lib/libshare/libshare_impl.h b/lib/libshare/libshare_impl.h index b845eb2d8ac..d8c924757fe 100644 --- a/lib/libshare/libshare_impl.h +++ b/lib/libshare/libshare_impl.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Gunnar Beutner - * Copyright (c) 2019, 2020 by Delphix. All rights reserved. + * Copyright (c) 2019, 2022 by Delphix. All rights reserved. 
*/ #ifndef _LIBSPL_LIBSHARE_IMPL_H #define _LIBSPL_LIBSHARE_IMPL_H @@ -39,6 +39,7 @@ typedef struct { boolean_t (*const is_shared)(sa_share_impl_t share); int (*const validate_shareopts)(const char *shareopts); int (*const commit_shares)(void); + void (*const truncate_shares)(void); } sa_fstype_t; extern const sa_fstype_t libshare_nfs_type, libshare_smb_type; diff --git a/lib/libshare/nfs.c b/lib/libshare/nfs.c index 161bbfb0ceb..bbaea93fca5 100644 --- a/lib/libshare/nfs.c +++ b/lib/libshare/nfs.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "nfs.h" @@ -281,6 +282,17 @@ nfs_toggle_share(const char *lockfile, const char *exports, return (error); } +void +nfs_reset_shares(const char *lockfile, const char *exports) +{ + int nfs_lock_fd = -1; + + if (nfs_exports_lock(lockfile, &nfs_lock_fd) == 0) { + (void) ! truncate(exports, 0); + nfs_exports_unlock(lockfile, &nfs_lock_fd); + } +} + static boolean_t nfs_is_shared_cb(void *userdata, char *line, boolean_t found_mountpoint) { diff --git a/lib/libshare/nfs.h b/lib/libshare/nfs.h index 58523c8f02e..f4340b18f89 100644 --- a/lib/libshare/nfs.h +++ b/lib/libshare/nfs.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2022 by Delphix. All rights reserved. */ #include "libshare_impl.h" @@ -33,3 +34,4 @@ boolean_t nfs_is_shared_impl(const char *exports, sa_share_impl_t impl_share); int nfs_toggle_share(const char *lockfile, const char *exports, const char *expdir, sa_share_impl_t impl_share, int(*cbk)(sa_share_impl_t impl_share, FILE *tmpfile)); +void nfs_reset_shares(const char *lockfile, const char *exports); diff --git a/lib/libshare/os/freebsd/nfs.c b/lib/libshare/os/freebsd/nfs.c index 78977a25f4f..521631c51f0 100644 --- a/lib/libshare/os/freebsd/nfs.c +++ b/lib/libshare/os/freebsd/nfs.c @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * Copyright (c) 2020 by Delphix. All rights reserved. + * Copyright (c) 2020, 2022 by Delphix. All rights reserved. */ #include @@ -195,6 +195,12 @@ nfs_commit_shares(void) return (SA_OK); } +static void +nfs_truncate_shares(void) +{ + nfs_reset_shares(ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE); +} + const sa_fstype_t libshare_nfs_type = { .enable_share = nfs_enable_share, .disable_share = nfs_disable_share, @@ -202,4 +208,5 @@ const sa_fstype_t libshare_nfs_type = { .validate_shareopts = nfs_validate_shareopts, .commit_shares = nfs_commit_shares, + .truncate_shares = nfs_truncate_shares, }; diff --git a/lib/libshare/os/linux/nfs.c b/lib/libshare/os/linux/nfs.c index 0870f37e581..0d63c989d34 100644 --- a/lib/libshare/os/linux/nfs.c +++ b/lib/libshare/os/linux/nfs.c @@ -23,7 +23,7 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Gunnar Beutner * Copyright (c) 2012 Cyril Plisko. All rights reserved. - * Copyright (c) 2019, 2020 by Delphix. All rights reserved. + * Copyright (c) 2019, 2022 by Delphix. All rights reserved. */ #include @@ -495,6 +495,12 @@ nfs_commit_shares(void) return (libzfs_run_process(argv[0], argv, 0)); } +static void +nfs_truncate_shares(void) +{ + nfs_reset_shares(ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE); +} + const sa_fstype_t libshare_nfs_type = { .enable_share = nfs_enable_share, .disable_share = nfs_disable_share, @@ -502,6 +508,7 @@ const sa_fstype_t libshare_nfs_type = { .validate_shareopts = nfs_validate_shareopts, .commit_shares = nfs_commit_shares, + .truncate_shares = nfs_truncate_shares, }; static boolean_t diff --git a/lib/libspl/include/libshare.h b/lib/libspl/include/libshare.h index d976f096ac3..deeb15c9770 100644 --- a/lib/libspl/include/libshare.h +++ b/lib/libspl/include/libshare.h @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2019, 2020 by Delphix. All rights reserved. 
+ * Copyright (c) 2019, 2022 by Delphix. All rights reserved. */ #ifndef _LIBSPL_LIBSHARE_H #define _LIBSPL_LIBSHARE_H extern __attribute__((visibility("default"))) @@ -88,6 +88,7 @@ _LIBSPL_LIBSHARE_H int sa_enable_share(const char *, const char *, const char *, _LIBSPL_LIBSHARE_H int sa_disable_share(const char *, enum sa_protocol); _LIBSPL_LIBSHARE_H boolean_t sa_is_shared(const char *, enum sa_protocol); _LIBSPL_LIBSHARE_H void sa_commit_shares(enum sa_protocol); +_LIBSPL_LIBSHARE_H void sa_truncate_shares(enum sa_protocol); /* protocol specific interfaces */ _LIBSPL_LIBSHARE_H int sa_validate_shareopts(const char *, enum sa_protocol); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 0494aec208e..7dd12df8171 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -245,6 +245,7 @@ + @@ -428,6 +429,7 @@ + @@ -758,6 +760,10 @@ + + + + @@ -787,7 +793,7 @@ - + @@ -803,6 +809,9 @@ + + + @@ -816,6 +825,8 @@ + + @@ -832,6 +843,9 @@ + + + @@ -2302,6 +2316,7 @@ + @@ -3034,6 +3049,10 @@ + + + + @@ -3150,6 +3169,7 @@ + @@ -3750,7 +3770,7 @@ - + @@ -3790,6 +3810,9 @@ + + + @@ -3903,16 +3926,17 @@ + - + - + @@ -4032,8 +4056,8 @@ - - + + @@ -4102,15 +4126,15 @@ - + - - + + @@ -4123,7 +4147,7 @@ - + @@ -4771,8 +4795,8 @@ - - + + @@ -4872,7 +4896,7 @@ - + @@ -4935,7 +4959,7 @@ - + @@ -5013,7 +5037,7 @@ - + @@ -5455,7 +5479,6 @@ - diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index fdfdd8d2808..44f7d698c82 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2021 by Delphix. All rights reserved. + * Copyright (c) 2014, 2022 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright 2017 RackTop Systems. * Copyright (c) 2018 Datto Inc. 
@@ -788,6 +788,16 @@ zfs_commit_shares(const enum sa_protocol *proto) sa_commit_shares(*p); } +void +zfs_truncate_shares(const enum sa_protocol *proto) +{ + if (proto == NULL) + proto = share_all_proto; + + for (const enum sa_protocol *p = proto; *p != SA_NO_PROTOCOL; ++p) + sa_truncate_shares(*p); +} + /* * Unshare the given filesystem. */ From 0e4c830bc19766e860e760e10e0d59250f12cced Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 12:55:37 -0400 Subject: [PATCH 25/69] Cleanup: Use OpenSolaris functions to call scheduler In our codebase, `cond_resched()` and `schedule()` are Linux kernel functions that have replaced the OpenSolaris `kpreempt()` functions in the codebase to such an extent that `kpreempt()` in zfs_context.h was broken. Nobody noticed because we did not actually use it. The header had defined `kpreempt()` as `yield()`, which works on OpenSolaris and Illumos where `sched_yield()` is a wrapper for `yield()`, but that does not work on any other platform. The FreeBSD platform specific code implemented shims for these, but the shim for `schedule()` forced us to wait, which is different than merely rescheduling to another thread as the original Linux code does, while the shim for `cond_resched()` had the same definition as its kernel kpreempt() shim. After studying this, I have concluded that we should reintroduce the kpreempt() function in platform independent code with the following definitions: - In the Linux kernel: kpreempt(unused) -> cond_resched() - In the FreeBSD kernel: kpreempt(unused) -> kern_yield(PRI_USER) - In userspace: kpreempt(unused) -> sched_yield() In userspace, nothing changes from this cleanup. In the kernels, the function `fm_fini()` will now call `kern_yield(PRI_USER)` on FreeBSD and `cond_resched()` on Linux. This is instead of `pause("schedule", 1)` on FreeBSD and `schedule()` on Linux. This makes our behavior consistent across platforms. Note that Linux's SPL continues to use `cond_resched()` and `schedule()`.
However, those functions have been removed from both the FreeBSD code and userspace code. This should have the benefit of making it slightly easier to port the code to new platforms by making how things should be mapped less confusing. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13845 --- include/os/freebsd/spl/sys/disp.h | 2 ++ include/os/freebsd/spl/sys/timer.h | 2 -- include/os/freebsd/zfs/sys/zfs_context_os.h | 2 -- include/os/linux/spl/sys/disp.h | 4 +++- include/sys/zfs_context.h | 5 +++-- module/zfs/arc.c | 4 ++-- module/zfs/dnode.c | 6 +++--- module/zfs/fm.c | 2 +- module/zfs/spa_log_spacemap.c | 2 +- 9 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/os/freebsd/spl/sys/disp.h b/include/os/freebsd/spl/sys/disp.h index 2be1b76e433..d46a7d2c014 100644 --- a/include/os/freebsd/spl/sys/disp.h +++ b/include/os/freebsd/spl/sys/disp.h @@ -31,6 +31,8 @@ #include +#define KPREEMPT_SYNC (-1) + #define kpreempt(x) kern_yield(PRI_USER) #endif /* _OPENSOLARIS_SYS_DISP_H_ */ diff --git a/include/os/freebsd/spl/sys/timer.h b/include/os/freebsd/spl/sys/timer.h index d4694bb7c09..7ff77e9b1b7 100644 --- a/include/os/freebsd/spl/sys/timer.h +++ b/include/os/freebsd/spl/sys/timer.h @@ -33,6 +33,4 @@ #define usleep_range(wakeup, wakeupepsilon) \ pause_sbt("usleep_range", ustosbt(wakeup), \ ustosbt(wakeupepsilon - wakeup), 0) - -#define schedule() pause("schedule", 1) #endif diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h index 86719950139..1ce72330412 100644 --- a/include/os/freebsd/zfs/sys/zfs_context_os.h +++ b/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -45,8 +45,6 @@ #define HAVE_LARGE_STACKS 1 #endif -#define cond_resched() kern_yield(PRI_USER) - #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ ((void) sizeof (dc), taskq_create(a, b, maxclsyspri, d, e, f)) diff --git a/include/os/linux/spl/sys/disp.h 
b/include/os/linux/spl/sys/disp.h index e106d3c5438..c8be6ffbf10 100644 --- a/include/os/linux/spl/sys/disp.h +++ b/include/os/linux/spl/sys/disp.h @@ -26,7 +26,9 @@ #include -#define kpreempt(unused) schedule() +#define KPREEMPT_SYNC (-1) + +#define kpreempt(unused) cond_resched() #define kpreempt_disable() preempt_disable() #define kpreempt_enable() preempt_enable() diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index aa4f7878963..83ed97fbec7 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -219,7 +219,6 @@ typedef pthread_t kthread_t; #define TS_JOINABLE 0x00000004 #define curthread ((void *)(uintptr_t)pthread_self()) -#define kpreempt(x) yield() #define getcomm() "unknown" #define thread_create_named(name, stk, stksize, func, arg, len, \ @@ -248,9 +247,11 @@ extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) +#define KPREEMPT_SYNC (-1) + +#define kpreempt(x) sched_yield() #define kpreempt_disable() ((void)0) #define kpreempt_enable() ((void)0) -#define cond_resched() sched_yield() /* * Mutexes diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 980dc60d0cc..b9969bff534 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4165,7 +4165,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * this CPU are able to make progress, make a voluntary preemption * call here. 
*/ - cond_resched(); + kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } @@ -10335,7 +10335,7 @@ l2arc_rebuild(l2arc_dev_t *dev) !dev->l2ad_first) goto out; - cond_resched(); + kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 67fe1e2c9a0..ef27dfd40af 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1142,7 +1142,7 @@ dnode_free_interior_slots(dnode_t *dn) while (!dnode_slots_tryenter(children, idx, slots)) { DNODE_STAT_BUMP(dnode_free_interior_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } dnode_set_slots(children, idx, slots, DN_SLOT_FREE); @@ -1423,7 +1423,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } /* @@ -1478,7 +1478,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } if (!dnode_check_slots_free(dnc, idx, slots)) { diff --git a/module/zfs/fm.c b/module/zfs/fm.c index e7a7ad58324..bc13b5517c4 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -1354,7 +1354,7 @@ fm_fini(void) zevent_flags |= ZEVENT_SHUTDOWN; while (zevent_waiters > 0) { mutex_exit(&zevent_lock); - schedule(); + kpreempt(KPREEMPT_SYNC); mutex_enter(&zevent_lock); } mutex_exit(&zevent_lock); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 19e334916bd..4ecce8214f6 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1176,7 +1176,7 @@ spa_ld_log_sm_data(spa_t *spa) } /* Load TXG log spacemap into ms_unflushed_allocs/frees. 
*/ - cond_resched(); + kpreempt(KPREEMPT_SYNC); ASSERT0(sls->sls_nblocks); sls->sls_nblocks = space_map_nblocks(sls->sls_sm); spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; From 13f2b8fb92c23090b9f6e701c8471aef6b8e917b Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 14:22:15 -0400 Subject: [PATCH 26/69] Fix use-after-free in btree code Coverity static analysis found these. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #10989 Closes #13861 --- module/zfs/btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 14cab4054cb..60b063ed907 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -1608,8 +1608,8 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); new_rm_hdr->bth_count = 0; - zfs_btree_node_destroy(tree, new_rm_hdr); zfs_btree_remove_from_node(tree, parent, new_rm_hdr); + zfs_btree_node_destroy(tree, new_rm_hdr); } /* Remove the element at the specific location. */ @@ -1817,10 +1817,10 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) /* Move our elements to the left neighbor. */ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); - zfs_btree_node_destroy(tree, rm_hdr); /* Remove the emptied node from the parent. */ zfs_btree_remove_from_node(tree, parent, rm_hdr); + zfs_btree_node_destroy(tree, rm_hdr); zfs_btree_verify(tree); } From 7195c04d986ecd26c25c13e3c180790a2e85a723 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 15:34:10 -0400 Subject: [PATCH 27/69] Fix file descriptor handling in zdb_copy_object() Coverity found a file descriptor leak. Eyeballing it showed that we had no handling for the `open()` call failing either. We can address both of these at once.
Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13862 --- cmd/zdb/zdb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 5389520e803..0fc4f0d0d1b 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4737,6 +4737,8 @@ zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) } int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd == -1) + return (errno); /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the @@ -4746,6 +4748,7 @@ zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) offset = 0; char *buf = kmem_alloc(oursize, KM_NOSLEEP); if (buf == NULL) { + (void) close(fd); return (ENOMEM); } @@ -4755,6 +4758,7 @@ zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(buf, oursize); + (void) close(fd); return (err); } if (dump_opt['v'] > 3) { From e5327e7f9790ed7e884a7f8d9fa412632506b826 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 15:51:17 -0400 Subject: [PATCH 28/69] vdev_draid_lookup_map() should not iterate outside draid_maps Coverity reported this as an out-of-bounds read. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13865 --- module/zfs/vdev_draid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 24ea5d2cbe1..032e8825a29 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -541,7 +541,7 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) int vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) { - for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { if (draid_maps[i].dm_children == children) { *mapp = &draid_maps[i]; return (0); From 710fd1ded68491a164d85aedc69ffd4675ec5c59 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 15:54:43 -0400 Subject: [PATCH 29/69] zpool_load_compat() should create strings of length ZFS_MAXPROPLEN Otherwise, `strlcat()` can overflow them. Coverity found this. Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13866 --- lib/libzfs/libzfs_pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 928f8b4287b..eea388cf348 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -4684,8 +4684,8 @@ zpool_load_compat(const char *compat, boolean_t *features, char *report, for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_TRUE; - char err_badfile[1024] = ""; - char err_badtoken[1024] = ""; + char err_badfile[ZFS_MAXPROPLEN] = ""; + char err_badtoken[ZFS_MAXPROPLEN] = ""; /* * We ignore errors from the directory open() From d5d10f2aef98e86d4873c435cdbd9b3ced447caf Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 13 Sep 2022 19:40:10 -0400 Subject: [PATCH 30/69] Cleanup dead spa_boot code Unused code detected by coverity. 
Reviewed-by: Allan Jude Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13868 --- include/Makefile.am | 1 - include/sys/spa_boot.h | 42 ------------------------- lib/libzpool/Makefile.am | 1 - module/Kbuild.in | 1 - module/Makefile.bsd | 1 - module/os/freebsd/zfs/spa_os.c | 1 - module/os/freebsd/zfs/zfs_vfsops.c | 1 - module/os/linux/zfs/zfs_vfsops.c | 1 - module/zfs/spa.c | 1 - module/zfs/spa_boot.c | 50 ------------------------------ 10 files changed, 100 deletions(-) delete mode 100644 include/sys/spa_boot.h delete mode 100644 module/zfs/spa_boot.c diff --git a/include/Makefile.am b/include/Makefile.am index 1a7f67e9c44..19726bba186 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -76,7 +76,6 @@ COMMON_H = \ sys/sa_impl.h \ sys/skein.h \ sys/spa.h \ - sys/spa_boot.h \ sys/spa_checkpoint.h \ sys/spa_checksum.h \ sys/spa_impl.h \ diff --git a/include/sys/spa_boot.h b/include/sys/spa_boot.h deleted file mode 100644 index 4a69efdda94..00000000000 --- a/include/sys/spa_boot.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_SPA_BOOT_H -#define _SYS_SPA_BOOT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern char *spa_get_bootprop(char *prop); -extern void spa_free_bootprop(char *prop); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_BOOT_H */ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index eaa920e5610..0cc1997f7a9 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -121,7 +121,6 @@ nodist_libzpool_la_SOURCES = \ module/zfs/sha256.c \ module/zfs/skein_zfs.c \ module/zfs/spa.c \ - module/zfs/spa_boot.c \ module/zfs/spa_checkpoint.c \ module/zfs/spa_config.c \ module/zfs/spa_errlog.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 4803952cbfe..7a20e6ee461 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -345,7 +345,6 @@ ZFS_OBJS := \ sha256.o \ skein_zfs.o \ spa.o \ - spa_boot.o \ spa_checkpoint.o \ spa_config.o \ spa_errlog.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 050b6c21e5e..8829ad94213 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -271,7 +271,6 @@ SRCS+= abd.c \ sha256.c \ skein_zfs.c \ spa.c \ - spa_boot.c \ spa_checkpoint.c \ spa_config.c \ spa_errlog.c \ diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c index 251fafcc964..9bc61a6c8fe 100644 --- a/module/os/freebsd/zfs/spa_os.c +++ b/module/os/freebsd/zfs/spa_os.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include #include diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 4e4a5f8d215..8b60b34d85c 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -62,7 +62,6 @@ #include #include #include -#include #include #include #include diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index eac3dcb6a55..d0575fe5e98 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ 
b/module/os/linux/zfs/zfs_vfsops.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b2b59af4294..eeec3b6be9c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -81,7 +81,6 @@ #include #include #include -#include #include #include #include diff --git a/module/zfs/spa_boot.c b/module/zfs/spa_boot.c deleted file mode 100644 index fddb5c3c968..00000000000 --- a/module/zfs/spa_boot.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifdef _KERNEL - -#include -#include -#include - -char * -spa_get_bootprop(char *propname) -{ - char *value; - - if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), - DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS) - return (NULL); - return (value); -} - -void -spa_free_bootprop(char *value) -{ - ddi_prop_free(value); -} - -#endif /* _KERNEL */ From 8fdc229a9cb6c7f5ba6cd8dc3b40a3c1355f66c5 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 13 Sep 2022 19:53:21 -0400 Subject: [PATCH 31/69] Fix memory leak in ztest Coverity found this. Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13863 --- cmd/ztest.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/ztest.c b/cmd/ztest.c index 31b9990a1fc..847c3a5b06c 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -7966,6 +7966,7 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) VERIFY3S(-1, !=, asprintf(&newlp, "%s:%s", libpath, curlp)); VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); + free(newlp); } } (void) execl(cmd, cmd, (char *)NULL); From cf66e7e594fc7063db8050f2b7c718ae3f94641b Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 13 Sep 2022 19:59:33 -0400 Subject: [PATCH 32/69] Cleanup: Make memory barrier definitions consistent across kernels We inherited membar_consumer() and membar_producer() from OpenSolaris, but we had replaced membar_consumer() with Linux's smp_rmb() in zfs_ioctl.c. The FreeBSD SPL consequently implemented a shim for the Linux-only smp_rmb(). We reinstate membar_consumer() in platform independent code and fix the FreeBSD SPL to implement membar_consumer() in a way analogous to Linux. 
Reviewed-by: Konstantin Belousov Reviewed-by: Mateusz Guzik Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Reviewed-by: Alexander Motin Signed-off-by: Richard Yao Closes #13843 --- include/os/freebsd/linux/compiler.h | 1 - include/os/freebsd/spl/sys/atomic.h | 3 ++- include/os/linux/spl/sys/vmsystm.h | 2 ++ module/zfs/zfs_ioctl.c | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/os/freebsd/linux/compiler.h b/include/os/freebsd/linux/compiler.h index 3a66da19589..b408b77c746 100644 --- a/include/os/freebsd/linux/compiler.h +++ b/include/os/freebsd/linux/compiler.h @@ -83,7 +83,6 @@ #define __printf(a, b) __printflike(a, b) #define barrier() __asm__ __volatile__("": : :"memory") -#define smp_rmb() rmb() #define ___PASTE(a, b) a##b #define __PASTE(a, b) ___PASTE(a, b) diff --git a/include/os/freebsd/spl/sys/atomic.h b/include/os/freebsd/spl/sys/atomic.h index 1a68bfc4de2..01b13fc9afd 100644 --- a/include/os/freebsd/spl/sys/atomic.h +++ b/include/os/freebsd/spl/sys/atomic.h @@ -57,7 +57,8 @@ extern uint64_t atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval); #endif -#define membar_producer atomic_thread_fence_rel +#define membar_consumer() atomic_thread_fence_acq() +#define membar_producer() atomic_thread_fence_rel() static __inline uint32_t atomic_add_32_nv(volatile uint32_t *target, int32_t delta) diff --git a/include/os/linux/spl/sys/vmsystm.h b/include/os/linux/spl/sys/vmsystm.h index b3f121ecf0c..fcd61e818fa 100644 --- a/include/os/linux/spl/sys/vmsystm.h +++ b/include/os/linux/spl/sys/vmsystm.h @@ -44,7 +44,9 @@ #define zfs_totalhigh_pages totalhigh_pages #endif +#define membar_consumer() smp_rmb() #define membar_producer() smp_wmb() + #define physmem zfs_totalram_pages #define xcopyin(from, to, size) copy_from_user(to, from, size) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 382975208b9..6b9b43271ba 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7482,7 +7482,7 @@ 
zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { - smp_rmb(); + membar_consumer(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); From fcd7293d4e7852a99c5c57443d6799895e10bc9f Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 13 Sep 2022 20:00:53 -0400 Subject: [PATCH 33/69] Remove incorrect free() in zfs_get_pci_slots_sys_path() Coverity found this. We attempted to free tmp, which is a pointer to a string that should be freed by the caller. Reviewed-by: Neal Gompa Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13864 --- lib/libzutil/os/linux/zutil_device_path_os.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/libzutil/os/linux/zutil_device_path_os.c b/lib/libzutil/os/linux/zutil_device_path_os.c index f081ef53da7..9f4c74f50f3 100644 --- a/lib/libzutil/os/linux/zutil_device_path_os.c +++ b/lib/libzutil/os/linux/zutil_device_path_os.c @@ -273,7 +273,6 @@ zfs_get_pci_slots_sys_path(const char *dev_name) free(address2); if (asprintf(&path, "/sys/bus/pci/slots/%s", ep->d_name) == -1) { - free(tmp); continue; } break; From d954ca19ba8b0c505e88a74a9681c4c81e7cfc57 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 13 Sep 2022 20:58:29 -0400 Subject: [PATCH 34/69] Fix theoretical "use-after-free" in dbuf_prefetch_indirect_done() Coverity complains about a "use-after-free" bug in `dbuf_prefetch_indirect_done()` because we use a pointer value after freeing its buffer. The pointer is used for refcounting in ARC (as the reference holder). There is a theoretical situation where the pointer would be reused in a way that causes the refcounting to collide, so we change the order in which we call arc_buf_destroy() and dbuf_prefetch_fini() to match the rest of the function. This prevents the theoretical situation from being a possibility. 
Also, we have a few return statements with a value, despite this being a void function. We clean those up while we are making changes here. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13869 --- module/zfs/dbuf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index b2d1b956878..80cab8177bc 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3254,7 +3254,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); - return (dbuf_prefetch_fini(dpa, B_TRUE)); + dbuf_prefetch_fini(dpa, B_TRUE); + return; } ASSERT(zio == NULL || zio->io_error == 0); @@ -3287,7 +3288,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { arc_buf_destroy(abuf, private); - return (dbuf_prefetch_fini(dpa, B_TRUE)); + dbuf_prefetch_fini(dpa, B_TRUE); + return; } (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); @@ -3305,7 +3307,9 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { + arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); + return; } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); From 4a6e8b99f5171705466b5a9542b47a935cad793d Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Wed, 14 Sep 2022 15:50:03 -0400 Subject: [PATCH 35/69] Add assertion to dsl_dataset_set_compression_sync Coverity pointed out that if we somehow receive SPA_FEATURE_NONE, we will use a negative number as an array index. A defensive assertion seems appropriate. 
Reviewed-by: Alexander Motin Reviewed-by: Neal Gompa Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13872 --- module/zfs/dsl_dataset.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 44da6a3f0d4..94b77aa1b74 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -4519,6 +4519,7 @@ dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx) uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); spa_feature_t f = zio_compress_to_feature(compval); + ASSERT3S(f, !=, SPA_FEATURE_NONE); ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds)); From ccec88f11a44746f78f88aac90f5172a52e04506 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Wed, 14 Sep 2022 15:51:55 -0400 Subject: [PATCH 36/69] FreeBSD: Fix integer conversion for vnlru_free{,_vfsops}() When reviewing #13875, I noticed that our FreeBSD code has an issue where it converts from `int64_t` to `int` when calling `vnlru_free{,_vfsops}()`. The result is that if the int64_t is `1 << 36`, the int will be 0, since the low bits are 0. Even when some low bits are set, a value such as `((1 << 36) + 1)` would truncate to 1, which is wrong. There is protection against this on 32-bit platforms, but on 64-bit platforms, there is no check to protect us, so we add a check. 
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Richard Yao Closes #13882 --- module/os/freebsd/zfs/arc_os.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index b4833adedcc..f1a3a0fafa9 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -142,6 +142,12 @@ arc_prune_task(void *arg) int64_t nr_scan = (intptr_t)arg; arc_reduce_target_size(ptob(nr_scan)); + +#ifndef __ILP32__ + if (nr_scan > INT_MAX) + nr_scan = INT_MAX; +#endif + #if __FreeBSD_version >= 1300139 sx_xlock(&arc_vnlru_lock); vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker); From 6f8602a5ede2c156f41630ba687701262f1350d6 Mon Sep 17 00:00:00 2001 From: George Melikov Date: Thu, 15 Sep 2022 02:26:57 +0300 Subject: [PATCH 37/69] CI: revert `--with-config=dist` to hotfix Ubuntu 20.04 Recently Github action runners started to fail on kmod build. Revert --with-config=dist from ./configure section of github runners to stabilize CI for now. 
Reviewed-by: Brian Behlendorf Signed-off-by: George Melikov Closes #13894 --- .github/workflows/zfs-tests-functional.yml | 2 +- .github/workflows/zfs-tests-sanity.yml | 2 +- .github/workflows/zloop.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/zfs-tests-functional.yml b/.github/workflows/zfs-tests-functional.yml index 328cb97f10e..0273610af04 100644 --- a/.github/workflows/zfs-tests-functional.yml +++ b/.github/workflows/zfs-tests-functional.yml @@ -28,7 +28,7 @@ jobs: ./autogen.sh - name: Configure run: | - ./configure --enable-debug --enable-debuginfo --enable-asan --enable-ubsan --with-config=dist + ./configure --enable-debug --enable-debuginfo --enable-asan --enable-ubsan - name: Make run: | make -j$(nproc) --no-print-directory --silent pkg-utils pkg-kmod diff --git a/.github/workflows/zfs-tests-sanity.yml b/.github/workflows/zfs-tests-sanity.yml index 4c15cecf58d..73606f909e1 100644 --- a/.github/workflows/zfs-tests-sanity.yml +++ b/.github/workflows/zfs-tests-sanity.yml @@ -24,7 +24,7 @@ jobs: ./autogen.sh - name: Configure run: | - ./configure --enable-debug --enable-debuginfo --enable-asan --enable-ubsan --with-config=dist + ./configure --enable-debug --enable-debuginfo --enable-asan --enable-ubsan - name: Make run: | make -j$(nproc) --no-print-directory --silent pkg-utils pkg-kmod diff --git a/.github/workflows/zloop.yml b/.github/workflows/zloop.yml index 64fe96a3ab6..d49eeae1653 100644 --- a/.github/workflows/zloop.yml +++ b/.github/workflows/zloop.yml @@ -23,7 +23,7 @@ jobs: ./autogen.sh - name: Configure run: | - ./configure --enable-debug --enable-debuginfo --enable-asan --enable-ubsan --with-config=dist + ./configure --enable-debug --enable-debuginfo --enable-asan --enable-ubsan - name: Make run: | make -j$(nproc) --no-print-directory --silent pkg-utils pkg-kmod From fd8c3012b3eedc6eed3dda67bf71cfb243400128 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 14:46:42 -0400 Subject: [PATCH 
38/69] Fix use-after-free bugs in icp code These were reported by Coverity as "Read from pointer after free" bugs. Presumably, it did not report it as a use-after-free bug because it does not understand the inline assembly that implements the atomic instruction. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13881 --- module/icp/core/kcf_mech_tabs.c | 2 +- module/icp/core/kcf_prov_tabs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/module/icp/core/kcf_mech_tabs.c b/module/icp/core/kcf_mech_tabs.c index 3d5063b28f6..41705e84bc4 100644 --- a/module/icp/core/kcf_mech_tabs.c +++ b/module/icp/core/kcf_mech_tabs.c @@ -342,8 +342,8 @@ kcf_remove_mech_provider(const char *mech_name, kcf_provider_desc_t *prov_desc) mech_entry->me_sw_prov = NULL; /* free entry */ - KCF_PROV_REFRELE(prov_mech->pm_prov_desc); KCF_PROV_IREFRELE(prov_mech->pm_prov_desc); + KCF_PROV_REFRELE(prov_mech->pm_prov_desc); kmem_free(prov_mech, sizeof (kcf_prov_mech_desc_t)); } diff --git a/module/icp/core/kcf_prov_tabs.c b/module/icp/core/kcf_prov_tabs.c index 865d4e19c6e..93af61a235d 100644 --- a/module/icp/core/kcf_prov_tabs.c +++ b/module/icp/core/kcf_prov_tabs.c @@ -158,8 +158,8 @@ kcf_prov_tab_rem_provider(crypto_provider_id_t prov_id) * at that time. */ - KCF_PROV_REFRELE(prov_desc); KCF_PROV_IREFRELE(prov_desc); + KCF_PROV_REFRELE(prov_desc); return (CRYPTO_SUCCESS); } From 621a7ebe5818033527d67564e538f1dc0caf5e22 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 14:50:19 -0400 Subject: [PATCH 39/69] Add coverity model to repository Other projects such as the python project include their coverity models in their repositories. This provides transparency, which is beneficial in open source projects. Therefore, it is a good idea to include the coverity model in our repository too. 
Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13884 --- contrib/coverity/model.c | 407 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 407 insertions(+) create mode 100644 contrib/coverity/model.c diff --git a/contrib/coverity/model.c b/contrib/coverity/model.c new file mode 100644 index 00000000000..ee2d01e7f3c --- /dev/null +++ b/contrib/coverity/model.c @@ -0,0 +1,407 @@ +/* + * Coverity Scan model + * https://scan.coverity.com/models + * + * This is a modeling file for Coverity Scan. + * Modeling helps to avoid false positives. + * + * - Modeling doesn't need full structs and typedefs. Rudimentary structs + * and similar types are sufficient. + * - An uninitialized local pointer is not an error. It signifies that the + * variable could be either NULL or have some data. + * + * Coverity Scan doesn't pick up modifications automatically. The model file + * must be uploaded by an admin in the analysis settings. + * + * Some of this initially cribbed from: + * + * https://github.com/kees/coverity-linux/blob/trunk/model.c + * + * The below model was based on the original model by Brian Behlendorf for the + * original zfsonlinux/zfs repository. Some inspiration was taken from + * kees/coverity-linux, specifically involving memory copies. + */ + +#include + +#define UMEM_DEFAULT 0x0000 /* normal -- may fail */ +#define UMEM_NOFAIL 0x0100 /* Never fails */ + +#define NULL (0) + +int condition0, condition1; + +void +abort() +{ + __coverity_panic__(); +} + +void +exit(int status) +{ + (void) status; + + __coverity_panic__(); +} + +void +_exit(int status) +{ + (void) status; + + __coverity_panic__(); +} + +void +zed_log_die(const char *fmt, ...) +{ + __coverity_format_string_sink__(fmt); + __coverity_panic__(); +} + +void +panic(const char *fmt, ...) 
+{ + __coverity_format_string_sink__(fmt); + __coverity_panic__(); +} + +void +vpanic(const char *fmt, va_list adx) +{ + (void) fmt; + (void) adx; + + __coverity_panic__(); +} + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + __coverity_tainted_data_argument__(from); + __coverity_tainted_data_argument__(to); + __coverity_writeall__(to); +} + +void * +memset(void *dst, int c, size_t len) +{ + __coverity_writeall__(dst); + return (dst); +} + +void * +memmove(void *dst, void *src, size_t len) +{ + __coverity_writeall__(dst); + return (dst); +} + +void * +memcpy(void *dst, void *src, size_t len) +{ + __coverity_writeall__(dst); + return (dst); +} + +void * +umem_alloc_aligned(size_t size, size_t align, int kmflags) +{ + (void) align; + + if (UMEM_NOFAIL & kmflags == UMEM_NOFAIL) + return (__coverity_alloc__(size)); + else if (condition0) + return (__coverity_alloc__(size)); + else + return (NULL); +} + +void * +umem_alloc(size_t size, int kmflags) +{ + if (UMEM_NOFAIL & kmflags == UMEM_NOFAIL) + return (__coverity_alloc__(size)); + else if (condition0) + return (__coverity_alloc__(size)); + else + return (NULL); +} + +void * +umem_zalloc(size_t size, int kmflags) +{ + if (UMEM_NOFAIL & kmflags == UMEM_NOFAIL) + return (__coverity_alloc__(size)); + else if (condition0) + return (__coverity_alloc__(size)); + else + return (NULL); +} + +void +umem_free(void *buf, size_t size) +{ + (void) size; + + __coverity_free__(buf); +} + +void * +spl_kmem_alloc(size_t sz, int fl, const char *func, int line) +{ + (void) func; + (void) line; + + if (condition1) + __coverity_sleep__(); + + if (fl == 0) { + return (__coverity_alloc__(sz)); + } else if (condition0) + return (__coverity_alloc__(sz)); + else + return (NULL); +} + +void * +spl_kmem_zalloc(size_t sz, int fl, const char *func, int line) +{ + (void) func; + (void) line; + + if (condition1) + __coverity_sleep__(); + + if (fl == 0) { + return (__coverity_alloc__(sz)); + } else if (condition0) + return 
(__coverity_alloc__(sz)); + else + return (NULL); +} + +void +spl_kmem_free(const void *ptr, size_t sz) +{ + (void) sz; + + __coverity_free__(ptr); +} + +typedef struct {} spl_kmem_cache_t; + +void * +spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) +{ + (void) skc; + + if (condition1) + __coverity_sleep__(); + + if (flags == 0) { + return (__coverity_alloc_nosize__()); + } else if (condition0) + return (__coverity_alloc_nosize__()); + else + return (NULL); +} + +void +spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) +{ + (void) skc; + + __coverity_free__(obj); +} + +void +malloc(size_t size) +{ + __coverity_alloc__(size); +} + +void +free(void *buf) +{ + __coverity_free__(buf); +} + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) +{ + __coverity_format_string_sink__(fmt); + __coverity_panic__(); +} + +int +sched_yield(void) +{ + __coverity_sleep__(); +} + +typedef struct {} kmutex_t; +typedef struct {} krwlock_t; +typedef int krw_t; + +/* + * Coverity reportedly does not support macros, so this only works for + * userspace.
+ */ + +void +mutex_enter(kmutex_t *mp) +{ + if (condition0) + __coverity_sleep__(); + + __coverity_exclusive_lock_acquire__(mp); +} + +int +mutex_tryenter(kmutex_t *mp) +{ + if (condition0) { + __coverity_exclusive_lock_acquire__(mp); + return (1); + } + + return (0); +} + +void +mutex_exit(kmutex_t *mp) +{ + __coverity_exclusive_lock_release__(mp); +} + +void +rw_enter(krwlock_t *rwlp, krw_t rw) +{ + (void) rw; + + if (condition0) + __coverity_sleep__(); + + __coverity_recursive_lock_acquire__(rwlp); +} + +void +rw_exit(krwlock_t *rwlp) +{ + __coverity_recursive_lock_release__(rwlp); + +} + +int +rw_tryenter(krwlock_t *rwlp, krw_t rw) +{ + if (condition0) { + __coverity_recursive_lock_acquire__(rwlp); + return (1); + } + + return (0); +} + +/* Thus, we fall back to the Linux kernel locks */ +struct {} mutex; +struct {} rw_semaphore; + +void +mutex_lock(struct mutex *lock) +{ + if (condition0) { + __coverity_sleep__(); + } + __coverity_exclusive_lock_acquire__(lock); +} + +void +mutex_unlock(struct mutex *lock) +{ + __coverity_exclusive_lock_release__(lock); +} + +void +down_read(struct rw_semaphore *sem) +{ + if (condition0) { + __coverity_sleep__(); + } + __coverity_recursive_lock_acquire__(sem); +} + +void +down_write(struct rw_semaphore *sem) +{ + if (condition0) { + __coverity_sleep__(); + } + __coverity_recursive_lock_acquire__(sem); +} + +int +down_read_trylock(struct rw_semaphore *sem) +{ + if (condition0) { + __coverity_recursive_lock_acquire__(sem); + return (1); + } + + return (0); +} + +int +down_write_trylock(struct rw_semaphore *sem) +{ + if (condition0) { + __coverity_recursive_lock_acquire__(sem); + return (1); + } + + return (0); +} + +void +up_read(struct rw_semaphore *sem) +{ + __coverity_recursive_lock_release__(sem); +} + +void +up_write(struct rw_semaphore *sem) +{ + __coverity_recursive_lock_release__(sem); +} + +int +__cond_resched(void) +{ + if (condition0) { + __coverity_sleep__(); + } +} + +/* + * An endian-independent filesystem must
support doing byte swaps on data. We + * attempt to suppress taint warnings, which are false positives for us. + */ +void +byteswap_uint64_array(void *vbuf, size_t size) +{ + __coverity_tainted_data_sanitize__(vbuf); +} + +void +byteswap_uint32_array(void *vbuf, size_t size) +{ + __coverity_tainted_data_sanitize__(vbuf); +} + +void +byteswap_uint16_array(void *vbuf, size_t size) +{ + __coverity_tainted_data_sanitize__(vbuf); +} From dc2fe24ca22392a589fbafdf15e4c32f42442006 Mon Sep 17 00:00:00 2001 From: John Wren Kennedy Date: Thu, 15 Sep 2022 14:14:35 -0600 Subject: [PATCH 40/69] ZTS: parameter expansion in zfs_unshare_006_pos zfs_unshare_006 checks to see if a dataset still has an active SMB share after doing an NFS unshare -a. The test could fail because the check for the SMB share does not expect dashes in a dataset name to be converted to underscores as pathname delimiters are. Reviewed-by: Tony Nguyen Signed-off-by: John Kennedy Closes #13893 --- tests/zfs-tests/include/libtest.shlib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 435dcb81c3c..d163fc7c8cc 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1265,7 +1265,7 @@ function is_shared_smb datasetexists "$fs" || return if is_linux; then - net usershare list | grep -xFq "${fs//\//_}" + net usershare list | grep -xFq "${fs//[-\/]/_}" else log_note "SMB on $UNAME currently unsupported by the test framework" return 1 From e949d36040e5e79fe0dfda6a33451111cc5a0476 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 16:24:00 -0400 Subject: [PATCH 41/69] Fix assertions in crypto reference helpers The assertions are racy and the use of `membar_exit()` did nothing to fix that. The helpers use atomic functions, so we cleverly get values from the atomics that we can use to ensure that the assertions operate on the correct values. 
We also use `membar_producer()` prior to decrementing reference counts so that operations that happened prior to a decrement to 0 will be guaranteed to happen before the decrement on architectures that reorder atomics. This also slightly improves performance by eliminating unnecessary reads, although I doubt it would be measurable in any benchmark. Reviewed-by: Mateusz Guzik Signed-off-by: Richard Yao Closes #13880 --- module/icp/include/sys/crypto/impl.h | 42 +++++++++++----------- module/icp/include/sys/crypto/sched_impl.h | 7 ++-- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/module/icp/include/sys/crypto/impl.h b/module/icp/include/sys/crypto/impl.h index 32ac43475a3..4d17221ea9a 100644 --- a/module/icp/include/sys/crypto/impl.h +++ b/module/icp/include/sys/crypto/impl.h @@ -126,28 +126,26 @@ typedef struct kcf_provider_desc { crypto_provider_id_t pd_prov_id; } kcf_provider_desc_t; -/* atomic operations in linux implicitly form a memory barrier */ -#define membar_exit() - /* * If a component has a reference to a kcf_provider_desc_t, * it REFHOLD()s. A new provider descriptor which is referenced only * by the providers table has a reference counter of one. 
*/ -#define KCF_PROV_REFHOLD(desc) { \ - atomic_add_32(&(desc)->pd_refcnt, 1); \ - ASSERT((desc)->pd_refcnt != 0); \ +#define KCF_PROV_REFHOLD(desc) { \ + int newval = atomic_add_32_nv(&(desc)->pd_refcnt, 1); \ + ASSERT(newval != 0); \ } -#define KCF_PROV_IREFHOLD(desc) { \ - atomic_add_32(&(desc)->pd_irefcnt, 1); \ - ASSERT((desc)->pd_irefcnt != 0); \ +#define KCF_PROV_IREFHOLD(desc) { \ + int newval = atomic_add_32_nv(&(desc)->pd_irefcnt, 1); \ + ASSERT(newval != 0); \ } #define KCF_PROV_IREFRELE(desc) { \ - ASSERT((desc)->pd_irefcnt != 0); \ - membar_exit(); \ - if (atomic_add_32_nv(&(desc)->pd_irefcnt, -1) == 0) { \ + membar_producer(); \ + int newval = atomic_add_32_nv(&(desc)->pd_irefcnt, -1); \ + ASSERT(newval != -1); \ + if (newval == 0) { \ cv_broadcast(&(desc)->pd_remove_cv); \ } \ } @@ -155,9 +153,10 @@ typedef struct kcf_provider_desc { #define KCF_PROV_REFHELD(desc) ((desc)->pd_refcnt >= 1) #define KCF_PROV_REFRELE(desc) { \ - ASSERT((desc)->pd_refcnt != 0); \ - membar_exit(); \ - if (atomic_add_32_nv(&(desc)->pd_refcnt, -1) == 0) { \ + membar_producer(); \ + int newval = atomic_add_32_nv(&(desc)->pd_refcnt, -1); \ + ASSERT(newval != -1); \ + if (newval == 0) { \ kcf_provider_zero_refcnt((desc)); \ } \ } @@ -193,9 +192,9 @@ typedef struct kcf_mech_entry { * it REFHOLD()s. A new policy descriptor which is referenced only * by the policy table has a reference count of one. */ -#define KCF_POLICY_REFHOLD(desc) { \ - atomic_add_32(&(desc)->pd_refcnt, 1); \ - ASSERT((desc)->pd_refcnt != 0); \ +#define KCF_POLICY_REFHOLD(desc) { \ + int newval = atomic_add_32_nv(&(desc)->pd_refcnt, 1); \ + ASSERT(newval != 0); \ } /* @@ -203,9 +202,10 @@ typedef struct kcf_mech_entry { * reference is released, the descriptor is freed. 
*/ #define KCF_POLICY_REFRELE(desc) { \ - ASSERT((desc)->pd_refcnt != 0); \ - membar_exit(); \ - if (atomic_add_32_nv(&(desc)->pd_refcnt, -1) == 0) \ + membar_producer(); \ + int newval = atomic_add_32_nv(&(desc)->pd_refcnt, -1); \ + ASSERT(newval != -1); \ + if (newval == 0) \ kcf_policy_free_desc(desc); \ } diff --git a/module/icp/include/sys/crypto/sched_impl.h b/module/icp/include/sys/crypto/sched_impl.h index 1989d5244e2..355c1a87faa 100644 --- a/module/icp/include/sys/crypto/sched_impl.h +++ b/module/icp/include/sys/crypto/sched_impl.h @@ -73,9 +73,10 @@ typedef struct kcf_context { * context structure is freed along with the global context. */ #define KCF_CONTEXT_REFRELE(ictx) { \ - ASSERT((ictx)->kc_refcnt != 0); \ - membar_exit(); \ - if (atomic_add_32_nv(&(ictx)->kc_refcnt, -1) == 0) \ + membar_producer(); \ + int newval = atomic_add_32_nv(&(ictx)->kc_refcnt, -1); \ + ASSERT(newval != -1); \ + if (newval == 0) \ kcf_free_context(ictx); \ } From fa22ec569c093d5583a7f406ba0a9bb223eae436 Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Thu, 15 Sep 2022 23:22:00 +0200 Subject: [PATCH 42/69] Use correct mdoc macros for arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Brian Behlendorf Reviewed-by: Ahelenia Ziemiańska Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org> Closes #13890 --- man/man7/zpoolprops.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index a150f6d4370..2164c126011 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -177,7 +177,7 @@ changed with the .Nm zpool Cm set command: .Bl -tag -width Ds -.It Sy ashift Ns = Ns Sy ashift +.It Sy ashift Ns = Ns Ar ashift Pool sector size exponent, to the power of .Sy 2 (internally referred to as From ddb1fd91c0dbf64847235ee65e50e87c43257b05 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 19:21:21 -0400 Subject: [PATCH 
43/69] Fix incorrect size given to bqueue_enqueue() call in dmu_redact.c We pass sizeof (struct redact_record *) rather than sizeof (struct redact_record). Passing the pointer size is wrong. Coverity caught this in two places. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13885 --- module/zfs/dmu_redact.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 09ca7d509ce..7afcc123134 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -142,7 +142,7 @@ record_merge_enqueue(bqueue_t *q, struct redact_record **build, { if (new->eos_marker) { if (*build != NULL) - bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue(q, *build, sizeof (**build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } @@ -824,7 +824,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_destroy(&end_tree); kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) - bqueue_enqueue(q, current_record, sizeof (current_record)); + bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); } From b24d1c77f7fc53d26ee915b5203a139f13fd9791 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 19:22:33 -0400 Subject: [PATCH 44/69] Add zfs_btree_verify_intensity kernel module parameter I see a few issues in the issue tracker that might be aided by being able to turn this on. We have no module parameter for it, so I would like to add one. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13874 --- cmd/zdb/zdb.c | 2 +- man/man4/zfs.4 | 16 ++++++++++++++++ module/zfs/btree.c | 8 +++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 0fc4f0d0d1b..92df3dd167b 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -121,7 +121,7 @@ extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; extern int zfs_reconstruct_indirect_combinations_max; -extern int zfs_btree_verify_intensity; +extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index cecaf7e7f0a..b2f3e7c61fb 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1354,6 +1354,22 @@ _ .TE .Sy \& * No Requires debug build. . +.It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint +Enables btree verification. +The following settings are cumulative: +.TS +box; +lbz r l l . + Value Description + + 1 Verify height. + 2 Verify pointers from children to parent. + 3 Verify element counts. + 4 Verify element order. (expensive) +* 5 Verify unused memory is poisoned. (expensive) +.TE +.Sy \& * No Requires debug build. +. .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int If destroy encounters an .Sy EIO diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 60b063ed907..f0a9222a430 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -53,7 +53,7 @@ kmem_cache_t *zfs_btree_leaf_cache; * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied).
*/ -int zfs_btree_verify_intensity = 0; +uint_t zfs_btree_verify_intensity = 0; /* * Convenience functions to silence warnings from memcpy/memmove's @@ -2171,3 +2171,9 @@ zfs_btree_verify(zfs_btree_t *tree) return; zfs_btree_verify_poison(tree); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, + "Enable btree verification. Levels above 4 require ZFS be built " + "with debugging"); +/* END CSTYLED */ From 768eacedef54922962562e601ca2c3366c4bcc4b Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Fri, 16 Sep 2022 13:36:47 -0700 Subject: [PATCH 45/69] zfs_enter rework Replace ZFS_ENTER and ZFS_VERIFY_ZP, which have hidden returns, with functions that return error code. The reason we want to do this is because hidden returns are not obvious and had caused some missing fail path unwinding. This patch changes the common, linux, and freebsd parts. Also fixes fail path unwinding in zfs_fsync, zpl_fsync, zpl_xattr_{list,get,set}, and zfs_lookup(). Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Chunwei Chen Closes #13831 --- include/os/freebsd/zfs/sys/zfs_znode_impl.h | 35 +- include/os/linux/zfs/sys/zfs_znode_impl.h | 60 ++-- include/sys/zfs_znode.h | 23 ++ module/os/freebsd/zfs/zfs_ctldir.c | 14 +- module/os/freebsd/zfs/zfs_vfsops.c | 48 +-- module/os/freebsd/zfs/zfs_vnops_os.c | 369 +++++++++++--------- module/os/linux/zfs/zfs_acl.c | 5 +- module/os/linux/zfs/zfs_ctldir.c | 44 ++- module/os/linux/zfs/zfs_vfsops.c | 36 +- module/os/linux/zfs/zfs_vnops_os.c | 300 ++++++++-------- module/os/linux/zfs/zpl_ctldir.c | 30 +- module/os/linux/zfs/zpl_file.c | 18 +- module/os/linux/zfs/zpl_super.c | 6 +- module/os/linux/zfs/zpl_xattr.c | 21 +- module/zfs/zfs_vnops.c | 68 ++-- 15 files changed, 591 insertions(+), 486 deletions(-) diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h index f76a841472f..41a5bb218c1 100644 --- a/include/os/freebsd/zfs/sys/zfs_znode_impl.h 
+++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -121,29 +121,24 @@ typedef struct zfs_soft_state { #define zn_rlimit_fsize(zp, uio) \ vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio)) -#define ZFS_ENTER_ERROR(zfsvfs, error) do { \ - ZFS_TEARDOWN_ENTER_READ((zfsvfs), FTAG); \ - if (__predict_false((zfsvfs)->z_unmounted)) { \ - ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ - return (error); \ - } \ -} while (0) - /* Called on entry to each ZFS vnode and vfs operation */ -#define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) +static inline int +zfs_enter(zfsvfs_t *zfsvfs, const char *tag) +{ + ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag); + if (__predict_false((zfsvfs)->z_unmounted)) { + ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag); + return (SET_ERROR(EIO)); + } + return (0); +} /* Must be called before exiting the vop */ -#define ZFS_EXIT(zfsvfs) ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG) - -#define ZFS_VERIFY_ZP_ERROR(zp, error) do { \ - if (__predict_false((zp)->z_sa_hdl == NULL)) { \ - ZFS_EXIT((zp)->z_zfsvfs); \ - return (error); \ - } \ -} while (0) - -/* Verifies the znode is valid */ -#define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) +static inline void +zfs_exit(zfsvfs_t *zfsvfs, const char *tag) +{ + ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag); +} /* * Macros for dealing with dmu_buf_hold diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index a6fa06a3f1a..52568781011 100644 --- a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -84,39 +84,41 @@ extern "C" { #define zrele(zp) iput(ZTOI((zp))) /* Called on entry to each ZFS inode and vfs operation. 
*/ -#define ZFS_ENTER_ERROR(zfsvfs, error) \ -do { \ - ZFS_TEARDOWN_ENTER_READ(zfsvfs, FTAG); \ - if (unlikely((zfsvfs)->z_unmounted)) { \ - ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ - return (error); \ - } \ -} while (0) -#define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) -#define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, -EIO) +static inline int +zfs_enter(zfsvfs_t *zfsvfs, const char *tag) +{ + ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag); + if (unlikely(zfsvfs->z_unmounted)) { + ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag); + return (SET_ERROR(EIO)); + } + return (0); +} /* Must be called before exiting the operation. */ -#define ZFS_EXIT(zfsvfs) \ -do { \ - zfs_exit_fs(zfsvfs); \ - ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ -} while (0) +static inline void +zfs_exit(zfsvfs_t *zfsvfs, const char *tag) +{ + zfs_exit_fs(zfsvfs); + ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag); +} -#define ZPL_EXIT(zfsvfs) \ -do { \ - rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ -} while (0) +static inline int +zpl_enter(zfsvfs_t *zfsvfs, const char *tag) +{ + return (-zfs_enter(zfsvfs, tag)); +} -/* Verifies the znode is valid. 
*/ -#define ZFS_VERIFY_ZP_ERROR(zp, error) \ -do { \ - if (unlikely((zp)->z_sa_hdl == NULL)) { \ - ZFS_EXIT(ZTOZSB(zp)); \ - return (error); \ - } \ -} while (0) -#define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) -#define ZPL_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, -EIO) +static inline void +zpl_exit(zfsvfs_t *zfsvfs, const char *tag) +{ + ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag); +} + +/* zfs_verify_zp and zfs_enter_verify_zp are defined in zfs_znode.h */ +#define zpl_verify_zp(zp) (-zfs_verify_zp(zp)) +#define zpl_enter_verify_zp(zfsvfs, zp, tag) \ + (-zfs_enter_verify_zp(zfsvfs, zp, tag)) /* * Macros for dealing with dmu_buf_hold diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index b223c4b3b30..7c906050bc4 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -218,6 +218,29 @@ typedef struct znode { ZNODE_OS_FIELDS; } znode_t; +/* Verifies the znode is valid. */ +static inline int +zfs_verify_zp(znode_t *zp) +{ + if (unlikely(zp->z_sa_hdl == NULL)) + return (SET_ERROR(EIO)); + return (0); +} + +/* zfs_enter and zfs_verify_zp together */ +static inline int +zfs_enter_verify_zp(zfsvfs_t *zfsvfs, znode_t *zp, const char *tag) +{ + int error; + if ((error = zfs_enter(zfsvfs, tag)) != 0) + return (error); + if ((error = zfs_verify_zp(zp)) != 0) { + zfs_exit(zfsvfs, tag); + return (error); + } + return (0); +} + typedef struct znode_hold { uint64_t zh_obj; /* object id */ kmutex_t zh_lock; /* lock serializing object access */ diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index 2c35b74cd3f..4b95b49dc40 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -1053,7 +1053,8 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) return (error); } - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); for (;;) { uint64_t cookie; uint64_t id; @@ -1070,7 +1071,7 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) *eofp = 1; error = 0; } - 
ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1083,7 +1084,7 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) if (error != 0) { if (error == ENAMETOOLONG) error = 0; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(error)); } zfs_uio_setoffset(&uio, cookie + dots_offset); @@ -1101,7 +1102,8 @@ zfsctl_snapdir_getattr(struct vop_getattr_args *ap) uint64_t snap_count; int err; - ZFS_ENTER(zfsvfs); + if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + return (err); ds = dmu_objset_ds(zfsvfs->z_os); zfsctl_common_getattr(vp, vap); vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); @@ -1111,14 +1113,14 @@ zfsctl_snapdir_getattr(struct vop_getattr_args *ap) err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (err != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } vap->va_nlink += snap_count; } vap->va_size = vap->va_nlink; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 8b60b34d85c..b290c36748c 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -286,7 +286,8 @@ zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) cmd = cmds >> SUBCMDSHIFT; type = cmds & SUBCMDMASK; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (id == -1) { switch (type) { case USRQUOTA: @@ -385,7 +386,7 @@ zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) break; } done: - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -426,7 +427,8 @@ zfs_sync(vfs_t *vfsp, int waitfor) if (error != 0) return (error); - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); dp = dmu_objset_pool(zfsvfs->z_os); /* @@ -434,14 +436,14 @@ zfs_sync(vfs_t *vfsp, int waitfor) * filesystems which may exist on a suspended pool. 
*/ if (rebooting && spa_suspended(dp->dp_spa)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); } else { /* * Sync all ZFS filesystems. This is what happens when you @@ -1408,10 +1410,12 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; + int error; statp->f_version = STATFS_VERSION; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); @@ -1458,7 +1462,7 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp) statp->f_namemax = MAXNAMELEN - 1; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -1469,13 +1473,14 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) znode_t *rootzp; int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); if (error == 0) { error = vn_lock(*vpp, flags); @@ -1712,7 +1717,8 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); - ZFS_ENTER(zfsvfs); + if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + return (err); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); @@ -1720,7 +1726,7 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) } if (err == 0) *vpp = ZTOV(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); if (err == 0) { err = vn_lock(*vpp, flags); if (err != 0) @@ -1774,7 +1780,8 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) *vpp = NULL; - ZFS_ENTER(zfsvfs); + if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + return (err); /* * On FreeBSD we can get snapshot's mount point or its parent file @@ -1790,12 +1797,13 
@@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); + if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + return (err); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { @@ -1807,7 +1815,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -1825,7 +1833,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); if (object == ZFSCTL_INO_SNAPDIR) { cn.cn_nameptr = "snapshot"; @@ -1860,7 +1868,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) (u_longlong_t)fid_gen, (u_longlong_t)gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, @@ -1872,12 +1880,12 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) dprintf("znode gen (%llu) != fid gen (%llu)\n", (u_longlong_t)zp_gen, (u_longlong_t)fid_gen); vrele(ZTOV(zp)); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); err = vn_lock(*vpp, flags); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); @@ -1945,7 +1953,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) /* * Attempt to re-establish all the active znodes with * their dbufs. 
If a zfs_rezget() fails, then we'll let - * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * any potential callers discover that via zfs_enter_verify_zp * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index f0579626c5a..57889b7390e 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -135,13 +135,13 @@ typedef ulong_t cookie_t; * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * This is done avoiding races using zfs_enter(zfsvfs). + * A zfs_exit(zfsvfs) is needed before all returns. Any znodes + * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() - * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * (if necessary) and zfs_exit(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- @@ -157,7 +157,7 @@ typedef ulong_t cookie_t; * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * - * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). 
@@ -192,7 +192,7 @@ typedef ulong_t cookie_t; * * In general, this is how things should be ordered in each vnode op: * - * ZFS_ENTER(zfsvfs); // exit if unmounted + * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need @@ -210,7 +210,7 @@ typedef ulong_t cookie_t; * goto top; * } * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs + * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does @@ -221,7 +221,7 @@ typedef ulong_t cookie_t; * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs + * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ static int @@ -230,13 +230,14 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr) (void) cr; znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } @@ -244,7 +245,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr) if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -254,15 +255,16 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) (void) offset, (void) cr; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); /* Decrement the synchronous opens in the znode */ if ((flag & O_SYNC) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); - 
ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -800,8 +802,8 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, const char *, nm); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); + if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) + return (error); #if __FreeBSD_version > 1300124 dvp_seqc = vn_seqc_read_notmodify(dvp); @@ -814,7 +816,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOPNOTSUPP)); } @@ -823,12 +825,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } *vpp = ZTOV(zp); @@ -841,7 +843,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, vrele(ZTOV(zp)); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -856,14 +858,14 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, } else #endif if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } @@ -881,7 +883,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, vnode_t *zfsctl_vp; int ltype; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK1(dvp); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, @@ -900,7 +902,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, 
FTAG); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); @@ -918,7 +920,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, if (error == 0) *vpp = ZTOV(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); if (error != 0) break; @@ -936,7 +938,11 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, if ((cnp->cn_flags & ISDOTDOT) == 0) break; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) { + vput(ZTOV(zp)); + *vpp = NULL; + return (error); + } if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { @@ -944,12 +950,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, &parent, sizeof (parent)); } if (error != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); break; } vput(ZTOV(zp)); @@ -1066,21 +1072,21 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } @@ -1092,7 +1098,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } ASSERT3P(zp, ==, NULL); @@ -1150,7 +1156,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, 
zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); @@ -1175,7 +1181,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1210,10 +1216,13 @@ zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zp = VTOZ(vp); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_verify_zp(zp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } zilog = zfsvfs->z_log; xattr_obj = 0; @@ -1271,7 +1280,7 @@ zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1303,7 +1312,7 @@ zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1408,32 +1417,32 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } 
} if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1448,20 +1457,20 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } ASSERT3P(zp, ==, NULL); if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } @@ -1488,7 +1497,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1520,7 +1529,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -1561,9 +1570,12 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) dmu_tx_t *tx; int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); + if ((error = zfs_verify_zp(zp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } zilog = zfsvfs->z_log; @@ -1588,7 +1600,7 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1607,7 +1619,7 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + 
zfs_exit(zfsvfs, FTAG); return (error); } @@ -1677,12 +1689,12 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, cookie_t *cooks = NULL; int flags = 0; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1697,7 +1709,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, * Check for valid iov_len. */ if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -1705,7 +1717,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -1930,7 +1942,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_uio_setoffset(uio, offset); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; @@ -1968,8 +1980,8 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) sa_bulk_attr_t bulk[4]; int count = 0; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); @@ -1981,7 +1993,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1994,7 +2006,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) (vap->va_uid != crgetuid(cr))) { if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } @@ -2145,7 +2157,7 @@ 
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) vap->va_blksize = zfsvfs->z_max_blksz; } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -2203,8 +2215,8 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (err); os = zfsvfs->z_os; zilog = zfsvfs->z_log; @@ -2218,17 +2230,17 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -2246,7 +2258,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } @@ -2263,27 +2275,27 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOVERFLOW)); } } if (xoap != NULL && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOVERFLOW)); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { - 
ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOPNOTSUPP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -2298,7 +2310,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOPNOTSUPP)); } } @@ -2307,7 +2319,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) aclp = NULL; if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } @@ -2325,7 +2337,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } } @@ -2473,7 +2485,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } @@ -2489,7 +2501,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } trim_mask |= AT_MODE; @@ -2521,7 +2533,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -2879,7 +2891,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -2904,14 +2916,17 @@ zfs_rename_relock_lookup(znode_t *sdzp, const 
struct componentname *scnp, * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. - * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock + * z_teardown_lock hidden behind zfs_enter and zfs_exit is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(sdzp); - ZFS_VERIFY_ZP(tdzp); + if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) + return (error); + if ((error = zfs_verify_zp(tdzp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * Re-resolve svp to be certain it still exists and fetch the @@ -2939,7 +2954,7 @@ zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp, } *tzpp = tzp; out: - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3209,9 +3224,12 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(tdzp); - ZFS_VERIFY_ZP(sdzp); + if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) + return (error); + if ((error = zfs_verify_zp(sdzp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(tnm, @@ -3234,10 +3252,17 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, } szp = VTOZ(*svpp); - ZFS_VERIFY_ZP(szp); + if ((error = zfs_verify_zp(szp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } tzp = *tvpp == NULL ? 
NULL : VTOZ(*tvpp); - if (tzp != NULL) - ZFS_VERIFY_ZP(tzp); + if (tzp != NULL) { + if ((error = zfs_verify_zp(tzp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } + } /* * This is to prevent the creation of links into attribute space @@ -3412,7 +3437,7 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, out: if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3487,24 +3512,24 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, ASSERT3S(vap->va_type, ==, VLNK); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3514,20 +3539,20 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0 /* projid */)) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } @@ -3550,7 +3575,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3589,7 +3614,7 @@ zfs_symlink(znode_t *dzp, 
const char *name, vattr_t *vap, if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3617,8 +3642,8 @@ zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct) zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, @@ -3628,7 +3653,7 @@ zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct) ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3661,8 +3686,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(tdzp); + if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; /* @@ -3670,11 +3695,14 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, * Better choices include ENOTSUP or EISDIR. 
*/ if (ZTOV(szp)->v_type == VDIR) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } - ZFS_VERIFY_ZP(szp); + if ((error = zfs_verify_zp(szp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * If we are using project inheritance, means if the directory has @@ -3685,13 +3713,13 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } @@ -3699,17 +3727,17 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } @@ -3720,19 +3748,19 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, * imposed in attribute space. 
*/ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3741,7 +3769,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, */ error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW); if (error) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3753,7 +3781,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3773,7 +3801,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3804,11 +3832,11 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, uint64_t off, len; int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -3817,12 +3845,12 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -3833,7 +3861,7 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * operates directly on inodes, so we need to check access rights. 
*/ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3842,7 +3870,7 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, error = zfs_freesp(zp, off, len, flag, TRUE); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3910,12 +3938,12 @@ zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) zfid_short_t *zfid; int size, i, error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3951,7 +3979,7 @@ zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) zlfid->zf_setgen[i] = 0; } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3961,6 +3989,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, { znode_t *zp; zfsvfs_t *zfsvfs; + int error; switch (cmd) { case _PC_LINK_MAX: @@ -3977,10 +4006,10 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); #else *valp = 0; #endif @@ -3989,10 +4018,10 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, case _PC_ACL_NFS4: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 
1 : 0; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); case _PC_ACL_PATH_MAX: @@ -4017,8 +4046,8 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int pgsin_b, pgsin_a; int error; - ZFS_ENTER_ERROR(zfsvfs, zfs_vm_pagerret_error); - ZFS_VERIFY_ZP_ERROR(zp, zfs_vm_pagerret_error); + if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0) + return (zfs_vm_pagerret_error); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); @@ -4055,7 +4084,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { if (lr != NULL) zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (zfs_vm_pagerret_bad); } @@ -4088,7 +4117,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); if (error != 0) return (zfs_vm_pagerret_error); @@ -4151,8 +4180,8 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; - ZFS_ENTER_ERROR(zfsvfs, zfs_vm_pagerret_error); - ZFS_VERIFY_ZP_ERROR(zp, zfs_vm_pagerret_error); + if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0) + return (zfs_vm_pagerret_error); off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; @@ -4267,7 +4296,7 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (rtvals[0]); } @@ -5425,9 +5454,9 @@ zfs_getextattr(struct vop_getextattr_args *ap) if (error != 0) return (error); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); error = ENOENT; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); error = zfs_getextattr_impl(ap, zfs_xattr_compat); @@ -5441,7 +5470,7 @@ zfs_getextattr(struct 
vop_getextattr_args *ap) } rw_exit(&zp->z_xattr_lock); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); if (error == ENOENT) error = SET_ERROR(ENOATTR); return (error); @@ -5568,8 +5597,8 @@ zfs_deleteextattr(struct vop_deleteextattr_args *ap) if (error != 0) return (error); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); rw_enter(&zp->z_xattr_lock, RW_WRITER); error = zfs_deleteextattr_impl(ap, zfs_xattr_compat); @@ -5583,7 +5612,7 @@ zfs_deleteextattr(struct vop_deleteextattr_args *ap) } rw_exit(&zp->z_xattr_lock); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); if (error == ENOENT) error = SET_ERROR(ENOATTR); return (error); @@ -5756,14 +5785,14 @@ zfs_setextattr(struct vop_setextattr_args *ap) if (error != 0) return (error); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); rw_enter(&zp->z_xattr_lock, RW_WRITER); error = zfs_setextattr_impl(ap, zfs_xattr_compat); rw_exit(&zp->z_xattr_lock); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -5960,8 +5989,8 @@ zfs_listextattr(struct vop_listextattr_args *ap) if (error != 0) return (SET_ERROR(error)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); rw_enter(&zp->z_xattr_lock, RW_READER); error = zfs_listextattr_impl(ap, zfs_xattr_compat); @@ -5971,7 +6000,7 @@ zfs_listextattr(struct vop_listextattr_args *ap) } rw_exit(&zp->z_xattr_lock); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -6087,8 +6116,8 @@ zfs_vptocnp(struct vop_vptocnp_args *ap) int ltype; int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); /* * If we are a snapshot mounted under .zfs, run the operation @@ -6110,10 +6139,10 @@ zfs_vptocnp(struct vop_vptocnp_args *ap) memcpy(ap->a_buf + *ap->a_buflen, name, len); *ap->a_vpp = 
ZTOV(dzp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); covered_vp = vp->v_mount->mnt_vnodecovered; #if __FreeBSD_version >= 1300045 @@ -6154,15 +6183,15 @@ zfs_deallocate(struct vop_deallocate_args *ap) off_t off, len, file_sz; int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } @@ -6175,7 +6204,7 @@ zfs_deallocate(struct vop_deallocate_args *ap) /* Fast path for out-of-range request. */ if (len <= 0) { *ap->a_len = 0; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -6188,7 +6217,7 @@ zfs_deallocate(struct vop_deallocate_args *ap) *ap->a_len = 0; } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } #endif diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index a139ee12c4d..4fd071d3cb2 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -2596,9 +2596,10 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) slow: DTRACE_PROBE(zfs__fastpath__execute__access__miss); - ZFS_ENTER(ZTOZSB(zdp)); + if ((error = zfs_enter(ZTOZSB(zdp), FTAG)) != 0) + return (error); error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); - ZFS_EXIT(ZTOZSB(zdp)); + zfs_exit(ZTOZSB(zdp), FTAG); return (error); } diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 32342d25ce6..4ae0a65370e 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -673,17 +673,19 @@ zfsctl_fid(struct inode *ip, fid_t *fidp) uint64_t object = zp->z_id; zfid_short_t *zfid; int i; + int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (zfsctl_is_snapdir(ip)) { - 
ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (zfsctl_snapdir_fid(ip, fidp)); } if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } @@ -698,7 +700,7 @@ zfsctl_fid(struct inode *ip, fid_t *fidp) for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -776,7 +778,8 @@ zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, zfsvfs_t *zfsvfs = ITOZSB(dip); int error = 0; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (strcmp(name, "..") == 0) { *ipp = dip->i_sb->s_root->d_inode; @@ -793,7 +796,7 @@ zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, if (*ipp == NULL) error = SET_ERROR(ENOENT); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -810,11 +813,12 @@ zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, uint64_t id; int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); if (error) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -823,7 +827,7 @@ zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, if (*ipp == NULL) error = SET_ERROR(ENOENT); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -844,7 +848,8 @@ zfsctl_snapdir_rename(struct inode *sdip, const char *snm, if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -902,7 +907,7 @@ zfsctl_snapdir_rename(struct inode *sdip, const char *snm, kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, 
FTAG); return (error); } @@ -922,7 +927,8 @@ zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr, if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -951,7 +957,7 @@ zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr, kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1076,7 +1082,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) return (SET_ERROR(EISDIR)); zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); @@ -1164,7 +1171,7 @@ zfsctl_snapshot_mount(struct path *path, int flags) kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); kmem_free(full_path, MAXPATHLEN); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1228,10 +1235,11 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, znode_t *dzp; int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (zfsvfs->z_shares_dir == 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } @@ -1240,7 +1248,7 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, zrele(dzp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index d0575fe5e98..251d9e9a40f 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -273,8 +273,10 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) * Sync a specific filesystem. 
*/ dsl_pool_t *dp; + int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); dp = dmu_objset_pool(zfsvfs->z_os); /* @@ -282,14 +284,14 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) * filesystems which may exist on a suspended pool. */ if (spa_suspended(dp->dp_spa)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); } else { /* * Sync all ZFS filesystems. This is what happens when you @@ -1092,7 +1094,8 @@ zfs_statvfs(struct inode *ip, struct kstatfs *statp) uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; - ZFS_ENTER(zfsvfs); + if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + return (err); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); @@ -1153,7 +1156,7 @@ zfs_statvfs(struct inode *ip, struct kstatfs *statp) err = zfs_statfs_project(zfsvfs, zp, statp, bshift); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -1163,13 +1166,14 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) znode_t *rootzp; int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *ipp = ZTOI(rootzp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1247,7 +1251,8 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) .gfp_mask = GFP_KERNEL, }; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ @@ -1288,7 +1293,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #endif - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "pruning, nr_to_scan=%lu objects=%d error=%d\n", @@ -1745,7 +1750,8 @@ 
zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); } - ZFS_ENTER(zfsvfs); + if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + return (err); /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { @@ -1761,7 +1767,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) */ VERIFY3P(igrab(*ipp), !=, NULL); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -1769,14 +1775,14 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } /* Don't export xattr stuff */ if (zp->z_pflags & ZFS_XATTR) { zrele(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } @@ -1791,7 +1797,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen); zrele(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } @@ -1799,7 +1805,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) if (*ipp) zfs_znode_update_vfs(ITOZ(*ipp)); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 0b3f7f2501e..1ff88c121a7 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -82,13 +82,13 @@ * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). 
Both of these macros + * This is done avoiding races using zfs_enter(zfsvfs). + * A zfs_exit(zfsvfs) is needed before all returns. Any znodes + * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if - * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the + * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires @@ -107,7 +107,7 @@ * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * - * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). @@ -142,7 +142,7 @@ * * In general, this is how things should be ordered in each vnode op: * - * ZFS_ENTER(zfsvfs); // exit if unmounted + * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) 
// lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need @@ -160,7 +160,7 @@ * goto top; * } * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs + * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does @@ -171,7 +171,7 @@ * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs + * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ int @@ -180,14 +180,15 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); /* Honor ZFS_APPENDONLY file attribute */ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } @@ -195,7 +196,7 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -205,15 +206,16 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -449,8 +451,8 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, } } - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); + if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) + return (error); *zpp = NULL; @@ -460,12 +462,12 
@@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -479,12 +481,12 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, *zpp = NULL; } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTDIR)); } @@ -493,13 +495,13 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } @@ -507,7 +509,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -566,21 +568,21 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } @@ -609,7 +611,7 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, zfs_acl_ids_free(&acl_ids); if 
(strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } @@ -681,7 +683,7 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); @@ -774,7 +776,7 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -808,14 +810,14 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } @@ -870,7 +872,7 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); @@ -894,7 +896,7 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, *ipp = ZTOI(zp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -941,8 +943,8 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { @@ -961,7 +963,7 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) NULL, realnmp))) { if (realnmp) pn_free(realnmp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1042,7 +1044,7 @@ zfs_remove(znode_t *dzp, 
char *name, cred_t *cr, int flags) zrele(zp); if (xzp) zrele(xzp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1131,7 +1133,7 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1188,18 +1190,18 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, if (dirname == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) @@ -1208,14 +1210,14 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } /* @@ -1231,21 +1233,21 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return 
(SET_ERROR(EDQUOT)); } @@ -1277,7 +1279,7 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1323,7 +1325,7 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1359,8 +1361,8 @@ zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) @@ -1373,7 +1375,7 @@ zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1424,7 +1426,7 @@ zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, } dmu_tx_abort(tx); zrele(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1452,7 +1454,7 @@ zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1491,8 +1493,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) @@ -1611,7 +1613,7 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) if (error == ENOENT) error = 0; out: - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1636,9 +1638,10 @@ zfs_getattr_fast(struct user_namespace *user_ns, struct inode 
*ip, zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); mutex_enter(&zp->z_lock); @@ -1673,7 +1676,7 @@ zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, dmu_objset_id(zfsvfs->z_os); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -1849,8 +1852,8 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) if (mask == 0) return (0); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (err); ip = ZTOI(zp); /* @@ -1862,13 +1865,13 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -1883,7 +1886,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } } @@ -1899,17 +1902,17 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -2526,7 +2529,7 @@ 
zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -2661,11 +2664,14 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(sdzp); + if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; - ZFS_VERIFY_ZP(tdzp); + if ((error = zfs_verify_zp(tdzp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * We check i_sb because snapshots and the ctldir must have different @@ -2673,13 +2679,13 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } @@ -2697,7 +2703,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -2727,7 +2733,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * the rename() function shall return successfully * and perform no other action." 
*/ - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } /* @@ -2799,7 +2805,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, if (strcmp(snm, "..") == 0) serr = EINVAL; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (serr); } if (terr) { @@ -2811,7 +2817,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, if (strcmp(tnm, "..") == 0) terr = EINVAL; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (terr); } @@ -2915,7 +2921,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, zrele(szp); if (tzp) zrele(tzp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -2989,7 +2995,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3032,26 +3038,26 @@ zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } top: @@ -3063,21 +3069,21 @@ zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); + 
zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); @@ -3104,7 +3110,7 @@ zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3159,7 +3165,7 @@ zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, zrele(zp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3185,8 +3191,8 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) zfsvfs_t *zfsvfs = ITOZSB(ip); int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); mutex_enter(&zp->z_lock); if (zp->z_is_sa) @@ -3196,7 +3202,7 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3241,8 +3247,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(tdzp); + if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; /* @@ -3250,11 +3256,14 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, * Better choices include ENOTSUP or EISDIR. 
*/ if (S_ISDIR(sip->i_mode)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } - ZFS_VERIFY_ZP(szp); + if ((error = zfs_verify_zp(szp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * If we are using project inheritance, means if the directory has @@ -3265,7 +3274,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } @@ -3274,7 +3283,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } @@ -3282,17 +3291,17 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) @@ -3305,19 +3314,19 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, * imposed in attribute space. 
*/ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3327,7 +3336,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3349,7 +3358,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, goto top; } dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ @@ -3391,7 +3400,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3448,8 +3457,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, int cnt = 0; struct address_space *mapping; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (err); ASSERT(PageLocked(pp)); @@ -3461,7 +3470,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3521,7 +3530,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3549,7 +3558,7 @@ zfs_putpage(struct inode *ip, struct page 
*pp, struct writeback_control *wbc, #endif } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3557,7 +3566,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3592,7 +3601,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, if (!for_sync) atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -3643,7 +3652,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -3665,8 +3674,8 @@ zfs_dirty_inode(struct inode *ip, int flags) if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); #ifdef I_DIRTY_TIME /* @@ -3714,7 +3723,7 @@ zfs_dirty_inode(struct inode *ip, int flags) dmu_tx_commit(tx); out: - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3831,14 +3840,14 @@ zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) if (pl == NULL) return (0); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (err); err = zfs_fillpage(ip, pl, nr_pages); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -3861,28 +3870,29 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, (void) addrp; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if ((vm_flags & VM_WRITE) && (zp->z_pflags & 
(ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENXIO)); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3913,11 +3923,11 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, uint64_t off, len; int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -3926,12 +3936,12 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -3942,7 +3952,7 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * operates directly on inodes, so we need to check access rights. 
*/ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3951,7 +3961,7 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, error = zfs_freesp(zp, off, len, flag, TRUE); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3966,19 +3976,23 @@ zfs_fid(struct inode *ip, fid_t *fidp) zfid_short_t *zfid; int size, i, error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } - ZFS_VERIFY_ZP(zp); + if ((error = zfs_verify_zp(zp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3999,7 +4013,7 @@ zfs_fid(struct inode *ip, fid_t *fidp) for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index ec8f2938598..1a688687ac4 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -57,7 +57,8 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); if (!zpl_dir_emit_dots(filp, ctx)) goto out; @@ -78,7 +79,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) ctx->pos++; } out: - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (error); } @@ -258,7 +259,8 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) uint64_t id, pos; int error = 0; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); cookie = spl_fstrans_mark(); if 
(!zpl_dir_emit_dots(filp, ctx)) @@ -282,7 +284,7 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) } out: spl_fstrans_unmark(cookie); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); if (error == -ENOENT) return (0); @@ -401,8 +403,10 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); #ifdef HAVE_USERNS_IOPS_GETATTR #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); @@ -422,7 +426,7 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, dmu_objset_pool(ds->ds_objset)->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (err != 0) { - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (-err); } stat->nlink += snap_count; @@ -430,7 +434,7 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (0); } @@ -508,7 +512,8 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) znode_t *dzp; int error = 0; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { @@ -527,7 +532,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); @@ -564,7 +569,8 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, znode_t *dzp; int error; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); if (zfsvfs->z_shares_dir == 0) { #ifdef HAVE_USERNS_IOPS_GETATTR @@ -578,7 +584,7 @@ zpl_shares_getattr_impl(const struct path *path, 
struct kstat *stat, #endif stat->nlink = stat->size = 2; stat->atime = current_time(ip); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (0); } @@ -596,7 +602,7 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, iput(ZTOI(dzp)); } - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index b0d9f37a3ec..f6bdfd08b83 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -195,9 +195,12 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) * zfs_putpage() respectively. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { + atomic_dec_32(&zp->z_sync_writes_cnt); + return (error); + } zil_commit(zfsvfs->z_log, zp->z_id); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); } error = filemap_write_and_wait_range(inode->i_mapping, start, end); @@ -752,10 +755,11 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) enum writeback_sync_modes sync_mode; int result; - ZPL_ENTER(zfsvfs); + if ((result = zpl_enter(zfsvfs, FTAG)) != 0) + return (result); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); sync_mode = wbc->sync_mode; /* @@ -769,11 +773,11 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) wbc->sync_mode = WB_SYNC_NONE; result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync); if (sync_mode != wbc->sync_mode) { - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); + if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (result); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); /* * We need to call write_cache_pages() again (we can't just diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index 
cf879a2897b..e3945a2a05f 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -185,7 +185,9 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data) static int __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) { - ZPL_ENTER(zfsvfs); + int error; + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dmu_objset_name(zfsvfs->z_os, fsname); @@ -205,7 +207,7 @@ __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (0); } diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index e7e299dcf1c..a010667adfa 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -246,8 +246,8 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) crhold(cr); cookie = spl_fstrans_mark(); - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); + if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + goto out1; rw_enter(&zp->z_xattr_lock, RW_READER); if (zfsvfs->z_use_sa && zp->z_is_sa) { @@ -264,7 +264,8 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) out: rw_exit(&zp->z_xattr_lock); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); +out1: spl_fstrans_unmark(cookie); crfree(cr); @@ -435,12 +436,13 @@ zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) crhold(cr); cookie = spl_fstrans_mark(); - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); + if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + goto out; rw_enter(&zp->z_xattr_lock, RW_READER); error = __zpl_xattr_get(ip, name, value, size, cr); rw_exit(&zp->z_xattr_lock); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); +out: spl_fstrans_unmark(cookie); crfree(cr); @@ -604,8 +606,8 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, crhold(cr); cookie = spl_fstrans_mark(); - 
ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); + if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + goto out1; rw_enter(&zp->z_xattr_lock, RW_WRITER); /* @@ -658,7 +660,8 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); out: rw_exit(&zp->z_xattr_lock); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); +out1: spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index b02e8283c77..57f03f11627 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -61,21 +61,23 @@ static ulong_t zfs_fsync_sync_cnt = 4; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { + int error = 0; zfsvfs_t *zfsvfs = ZTOZSB(zp); (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + goto out; atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); atomic_dec_32(&zp->z_sync_writes_cnt); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); } +out: tsd_set(zfs_fsyncer_key, NULL); - return (0); + return (error); } @@ -146,12 +148,12 @@ zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); error = zfs_holey_common(zp, cmd, off); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ @@ -162,15 +164,15 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); 
return (error); } @@ -201,17 +203,17 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } @@ -219,7 +221,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -227,7 +229,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -312,7 +314,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -404,8 +406,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); sa_bulk_attr_t bulk[4]; int count = 0; @@ -422,7 +424,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * so check it explicitly here. 
*/ if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } @@ -434,7 +436,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } @@ -443,7 +445,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -455,7 +457,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * Skip this if uio contains loaned arc_buf. */ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } @@ -490,7 +492,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (zn_rlimit_fsize(zp, uio)) { zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } @@ -498,7 +500,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (woff >= limit) { zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } @@ -761,7 +763,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -773,7 +775,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -784,10 +786,10 @@ zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -800,15 +802,15 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } From 1b6f3368dd5b416753178da06cb19c32798671e6 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Fri, 16 Sep 2022 13:43:26 -0700 Subject: [PATCH 46/69] Fix unable to export zpool without nfs-utils Don't return error in nfs_disable_share when nfs is not available, since it wouldn't have been able to share in the first place. Reviewed-by: Brian Behlendorf Signed-off-by: Chunwei Chen Closes #13534 Closes #13800 --- lib/libshare/os/linux/nfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libshare/os/linux/nfs.c b/lib/libshare/os/linux/nfs.c index 0d63c989d34..c27e5564c1e 100644 --- a/lib/libshare/os/linux/nfs.c +++ b/lib/libshare/os/linux/nfs.c @@ -449,7 +449,7 @@ static int nfs_disable_share(sa_share_impl_t impl_share) { if (!nfs_available()) - return (SA_SYSTEM_ERR); + return (SA_OK); return (nfs_toggle_share( ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, ZFS_EXPORTS_DIR, impl_share, From 8da218a7a2ee0d0c0a0741f1007ebce0bf22584a Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 16 Sep 2022 16:45:15 -0400 Subject: [PATCH 47/69] Update coverity model `uu_panic()` needs to be modelled and the definition of `vpanic()` from the original coverity model was missing `__coverity_format_string_sink__()`. 
We also model `libspl_assertf()` as part of an attempt to eliminate false positives. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13901 --- contrib/coverity/model.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/contrib/coverity/model.c b/contrib/coverity/model.c index ee2d01e7f3c..d27abd03876 100644 --- a/contrib/coverity/model.c +++ b/contrib/coverity/model.c @@ -70,9 +70,24 @@ panic(const char *fmt, ...) void vpanic(const char *fmt, va_list adx) { - (void) fmt; (void) adx; + __coverity_format_string_sink__(fmt); + __coverity_panic__(); +} + +void +uu_panic(const char *format, ...) +{ + __coverity_format_string_sink__(format); + __coverity_panic__(); +} + +int +libspl_assertf(const char *file, const char *func, int line, + const char *format, ...) +{ + __coverity_format_string_sink__(format); __coverity_panic__(); } From 577d41d3b2e4b37f51270c399c85b2708e21238a Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Sat, 17 Sep 2022 01:52:25 +0500 Subject: [PATCH 48/69] zfs recv hangs if max recordsize is less than received recordsize - Some optimizations for bqueue enqueue/dequeue. - Added a fix to prevent deadlock when both bqueue_enqueue_impl() and bqueue_dequeue() waits for signal to be triggered. 
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Ameer Hamza Closes #13855 --- include/sys/bqueue.h | 14 +++++++------- include/sys/fs/zfs.h | 6 +++--- module/zfs/bqueue.c | 23 +++++++++++++---------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h index 797aecd791a..b9621966027 100644 --- a/include/sys/bqueue.h +++ b/include/sys/bqueue.h @@ -30,22 +30,22 @@ typedef struct bqueue { kmutex_t bq_lock; kcondvar_t bq_add_cv; kcondvar_t bq_pop_cv; - uint64_t bq_size; - uint64_t bq_maxsize; - uint64_t bq_fill_fraction; + size_t bq_size; + size_t bq_maxsize; + uint_t bq_fill_fraction; size_t bq_node_offset; } bqueue_t; typedef struct bqueue_node { list_node_t bqn_node; - uint64_t bqn_size; + size_t bqn_size; } bqueue_node_t; -int bqueue_init(bqueue_t *, uint64_t, uint64_t, size_t); +int bqueue_init(bqueue_t *, uint_t, size_t, size_t); void bqueue_destroy(bqueue_t *); -void bqueue_enqueue(bqueue_t *, void *, uint64_t); -void bqueue_enqueue_flush(bqueue_t *, void *, uint64_t); +void bqueue_enqueue(bqueue_t *, void *, size_t); +void bqueue_enqueue_flush(bqueue_t *, void *, size_t); void *bqueue_dequeue(bqueue_t *); boolean_t bqueue_empty(bqueue_t *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 8cbd0e6024a..dedee0e7bd5 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1758,9 +1758,9 @@ typedef enum { * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * The recordsize property can not be set larger than zfs_max_recordsize + * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near + * zfs_max_recordsize in dsl_dataset.c for details. 
* * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c index 22539efc4e2..ec5ce4388ec 100644 --- a/module/zfs/bqueue.c +++ b/module/zfs/bqueue.c @@ -42,8 +42,7 @@ obj2node(bqueue_t *q, void *data) * Return 0 on success, or -1 on failure. */ int -bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size, - size_t node_offset) +bqueue_init(bqueue_t *q, uint_t fill_fraction, size_t size, size_t node_offset) { if (fill_fraction == 0) { return (-1); @@ -78,22 +77,26 @@ bqueue_destroy(bqueue_t *q) } static void -bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, - boolean_t flush) +bqueue_enqueue_impl(bqueue_t *q, void *data, size_t item_size, boolean_t flush) { ASSERT3U(item_size, >, 0); ASSERT3U(item_size, <=, q->bq_maxsize); mutex_enter(&q->bq_lock); obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { + while (q->bq_size && q->bq_size + item_size > q->bq_maxsize) { + /* + * Wake up bqueue_dequeue() thread if already sleeping in order + * to prevent the deadlock condition + */ + cv_signal(&q->bq_pop_cv); cv_wait_sig(&q->bq_add_cv, &q->bq_lock); } q->bq_size += item_size; list_insert_tail(&q->bq_list, data); - if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) - cv_signal(&q->bq_pop_cv); if (flush) cv_broadcast(&q->bq_pop_cv); + else if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) + cv_signal(&q->bq_pop_cv); mutex_exit(&q->bq_lock); } @@ -103,7 +106,7 @@ bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, * > 0. */ void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_FALSE); } @@ -117,7 +120,7 @@ bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) * destroy the condvar before the enqueuing thread is done. 
*/ void -bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue_flush(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_TRUE); } @@ -130,7 +133,7 @@ void * bqueue_dequeue(bqueue_t *q) { void *ret = NULL; - uint64_t item_size; + size_t item_size; mutex_enter(&q->bq_lock); while (q->bq_size == 0) { cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); From 6c8e9f09c22446cb8a1415ed1db05231cd659f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Fri, 16 Sep 2022 22:59:25 +0200 Subject: [PATCH 49/69] =?UTF-8?q?Handle=20ECKSUM=20as=20new=20EZFS=5FCKSUM?= =?UTF-8?q?=20=E2=80=92=20"insufficient=20replicas"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a meaningful error message for ECKSUM to common error messages. Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Closes #6805 Closes #13808 Closes #13898 --- include/libzfs.h | 1 + lib/libzfs/libzfs_util.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/libzfs.h b/include/libzfs.h index 92c7bf6d1c9..4fc77612259 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -151,6 +151,7 @@ typedef enum zfs_error { EZFS_REBUILDING, /* resilvering (sequential reconstrution) */ EZFS_VDEV_NOTSUP, /* ops not supported for this type of vdev */ EZFS_NOT_USER_NAMESPACE, /* a file is not a user namespace */ + EZFS_CKSUM, /* insufficient replicas */ EZFS_UNKNOWN } zfs_error_t; diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index cca86d2d782..3067e8d4639 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -170,6 +170,8 @@ libzfs_error_description(libzfs_handle_t *hdl) return (dgettext(TEXT_DOMAIN, "I/O error")); case EZFS_INTR: return (dgettext(TEXT_DOMAIN, "signal received")); + case EZFS_CKSUM: + return (dgettext(TEXT_DOMAIN, "insufficient replicas")); case EZFS_ISSPARE: return (dgettext(TEXT_DOMAIN, "device is reserved as a hot 
" "spare")); @@ -396,6 +398,10 @@ zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, case EINTR: zfs_verror(hdl, EZFS_INTR, fmt, ap); return (-1); + + case ECKSUM: + zfs_verror(hdl, EZFS_CKSUM, fmt, ap); + return (-1); } return (0); From 4df8ccc83dc59c6921a4b8df4cd01f08ead3114a Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 16 Sep 2022 17:02:54 -0400 Subject: [PATCH 50/69] Fix null pointer dereferences in PAM Coverity caught these. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13889 --- contrib/pam_zfs_key/pam_zfs_key.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 6f95d468074..c1001e6b81c 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -531,7 +531,6 @@ zfs_key_config_get_dataset(zfs_key_config_t *config) if (zhp == NULL) { pam_syslog(NULL, LOG_ERR, "dataset %s not found", config->homes_prefix); - zfs_close(zhp); return (NULL); } @@ -543,6 +542,10 @@ zfs_key_config_get_dataset(zfs_key_config_t *config) return (dsname); } + if (config->homes_prefix == NULL) { + return (NULL); + } + size_t len = ZFS_MAX_DATASET_NAME_LEN; size_t total_len = strlen(config->homes_prefix) + 1 + strlen(config->username); From 7dee043af5d9fce99611bca5863bf6ca28b741ba Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 16 Sep 2022 14:22:52 -0700 Subject: [PATCH 51/69] zfs_enter rework followup The zpl_fadvise() function was recently added and was not included in the initial patch. Update it accordingly. 
Signed-off-by: Brian Behlendorf Closes #13831 --- module/os/linux/zfs/zpl_file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index f6bdfd08b83..25fc6b22329 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -930,8 +930,8 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) if (offset < 0 || len < 0) return (-EINVAL); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); switch (advice) { case POSIX_FADV_SEQUENTIAL: @@ -963,7 +963,7 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) break; } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } From 75e8b5ad847ed7fd9e40ffdf33989b6578469903 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 3 Aug 2022 18:36:41 +0200 Subject: [PATCH 52/69] Fix BLAKE3 tuneable and module loading on Linux and FreeBSD Apply similar options to BLAKE3 as it is done for zfs_fletcher_4_impl. The zfs module parameter on Linux changes from icp_blake3_impl to zfs_blake3_impl. 
You can check and set it on Linux via sysfs like this: ``` [bash]# cat /sys/module/zfs/parameters/zfs_blake3_impl cycle [fastest] generic sse2 sse41 avx2 [bash]# echo sse2 > /sys/module/zfs/parameters/zfs_blake3_impl [bash]# cat /sys/module/zfs/parameters/zfs_blake3_impl cycle fastest generic [sse2] sse41 avx2 ``` The modprobe module parameters may also be used now: ``` [bash]# modprobe zfs zfs_blake3_impl=sse41 [bash]# cat /sys/module/zfs/parameters/zfs_blake3_impl cycle fastest generic sse2 [sse41] avx2 ``` On FreeBSD the BLAKE3 implementation can be set via sysctl like this: ``` [bsd]# sysctl vfs.zfs.blake3_impl vfs.zfs.blake3_impl: cycle [fastest] generic sse2 sse41 avx2 [bsd]# sysctl vfs.zfs.blake3_impl=sse2 vfs.zfs.blake3_impl: cycle [fastest] generic sse2 sse41 avx2 \ -> cycle fastest generic [sse2] sse41 avx2 ``` This commit changes also some Blake3 internals like these: - blake3_impl_ops_t was renamed to blake3_ops_t - all functions are named blake3_impl_NAME() now Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Co-authored-by: Ryan Moeller Signed-off-by: Tino Reichardt Closes #13725 --- cmd/ztest.c | 4 +- include/os/freebsd/spl/sys/mod_os.h | 7 +- include/sys/blake3.h | 23 +- module/icp/algs/blake3/blake3.c | 14 +- module/icp/algs/blake3/blake3_generic.c | 2 +- module/icp/algs/blake3/blake3_impl.c | 366 +++++++++++++-------- module/icp/algs/blake3/blake3_impl.h | 14 +- module/icp/algs/blake3/blake3_x86-64.c | 8 +- module/zfs/zfs_chksum.c | 20 +- tests/zfs-tests/cmd/checksum/blake3_test.c | 12 +- 10 files changed, 273 insertions(+), 197 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index 847c3a5b06c..0712f286bf6 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -6413,7 +6413,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) void *res2 = &zc_res2; /* BLAKE3_KEY_LEN = 32 */ - VERIFY0(blake3_set_impl_name("generic")); + VERIFY0(blake3_impl_setname("generic")); templ = abd_checksum_blake3_tmpl_init(&salt); Blake3_InitKeyed(&ctx, salt_ptr); 
Blake3_Update(&ctx, buf, size); @@ -6422,7 +6422,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) ZIO_CHECKSUM_BSWAP(&zc_ref2); abd_checksum_blake3_tmpl_free(templ); - VERIFY0(blake3_set_impl_name("cycle")); + VERIFY0(blake3_impl_setname("cycle")); while (run_count-- > 0) { /* Test current implementation */ diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 95a19cc940c..e2815ce9e54 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -31,10 +31,6 @@ #include -#define EXPORT_SYMBOL(x) -#define module_param(a, b, c) -#define MODULE_PARM_DESC(a, b) - #define ZMOD_RW CTLFLAG_RWTUN #define ZMOD_RD CTLFLAG_RDTUN @@ -98,6 +94,9 @@ #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" +#define blake3_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, blake3_param, "A" + #include #define module_init(fn) \ static void \ diff --git a/include/sys/blake3.h b/include/sys/blake3.h index 19500585f38..ad65fc8db7b 100644 --- a/include/sys/blake3.h +++ b/include/sys/blake3.h @@ -72,7 +72,7 @@ typedef struct { */ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; - /* const blake3_impl_ops_t *ops */ + /* const blake3_ops_t *ops */ const void *ops; } BLAKE3_CTX; @@ -97,26 +97,23 @@ extern void **blake3_per_cpu_ctx; extern void blake3_per_cpu_ctx_init(void); extern void blake3_per_cpu_ctx_fini(void); -/* return number of supported implementations */ -extern int blake3_get_impl_count(void); +/* get count of supported implementations */ +extern uint32_t blake3_impl_getcnt(void); -/* return id of selected implementation */ -extern int blake3_get_impl_id(void); +/* get id of selected implementation */ +extern uint32_t blake3_impl_getid(void); -/* return name of selected implementation */ -extern const char *blake3_get_impl_name(void); +/* get name of selected implementation */ +extern const char *blake3_impl_getname(void); /* setup id as fastest implementation */ -extern 
void blake3_set_impl_fastest(uint32_t id); +extern void blake3_impl_set_fastest(uint32_t id); /* set implementation by id */ -extern void blake3_set_impl_id(uint32_t id); +extern void blake3_impl_setid(uint32_t id); /* set implementation by name */ -extern int blake3_set_impl_name(const char *name); - -/* set startup implementation */ -extern void blake3_setup_impl(void); +extern int blake3_impl_setname(const char *name); #ifdef __cplusplus } diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c index b9600207b67..5f701859882 100644 --- a/module/icp/algs/blake3/blake3.c +++ b/module/icp/algs/blake3/blake3.c @@ -129,7 +129,7 @@ static output_t make_output(const uint32_t input_cv[8], * bytes. For that reason, chaining values in the CV stack are represented as * bytes. */ -static void output_chaining_value(const blake3_impl_ops_t *ops, +static void output_chaining_value(const blake3_ops_t *ops, const output_t *ctx, uint8_t cv[32]) { uint32_t cv_words[8]; @@ -139,7 +139,7 @@ static void output_chaining_value(const blake3_impl_ops_t *ops, store_cv_words(cv, cv_words); } -static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx, +static void output_root_bytes(const blake3_ops_t *ops, const output_t *ctx, uint64_t seek, uint8_t *out, size_t out_len) { uint64_t output_block_counter = seek / 64; @@ -163,7 +163,7 @@ static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx, } } -static void chunk_state_update(const blake3_impl_ops_t *ops, +static void chunk_state_update(const blake3_ops_t *ops, blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len) { if (ctx->buf_len > 0) { @@ -230,7 +230,7 @@ static size_t left_len(size_t content_len) * number of chunks hashed. These chunks are never the root and never empty; * those cases use a different codepath. 
*/ -static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops, +static size_t compress_chunks_parallel(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { @@ -274,7 +274,7 @@ static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops, * return it as an additional output.) These parents are never the root and * never empty; those cases use a different codepath. */ -static size_t compress_parents_parallel(const blake3_impl_ops_t *ops, +static size_t compress_parents_parallel(const blake3_ops_t *ops, const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out) { @@ -320,7 +320,7 @@ static size_t compress_parents_parallel(const blake3_impl_ops_t *ops, * of implementing this special rule? Because we don't want to limit SIMD or * multi-threading parallelism for that update(). */ -static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops, +static size_t blake3_compress_subtree_wide(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { @@ -406,7 +406,7 @@ static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops, * As with compress_subtree_wide(), this function is not used on inputs of 1 * chunk or less. That's a different codepath. 
*/ -static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops, +static void compress_subtree_to_parent_node(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c index 6c1eb33e89c..94a1f108236 100644 --- a/module/icp/algs/blake3/blake3_generic.c +++ b/module/icp/algs/blake3/blake3_generic.c @@ -192,7 +192,7 @@ static inline boolean_t blake3_is_generic_supported(void) return (B_TRUE); } -const blake3_impl_ops_t blake3_generic_impl = { +const blake3_ops_t blake3_generic_impl = { .compress_in_place = blake3_compress_in_place_generic, .compress_xof = blake3_compress_xof_generic, .hash_many = blake3_hash_many_generic, diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c index 10741c82de7..5276fd88fbb 100644 --- a/module/icp/algs/blake3/blake3_impl.c +++ b/module/icp/algs/blake3/blake3_impl.c @@ -28,7 +28,7 @@ #include "blake3_impl.h" -static const blake3_impl_ops_t *const blake3_impls[] = { +static const blake3_ops_t *const blake3_impls[] = { &blake3_generic_impl, #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE2)) || \ @@ -48,160 +48,199 @@ static const blake3_impl_ops_t *const blake3_impls[] = { #endif }; -/* this pointer holds current ops for implementation */ -static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl; - -/* special implementation selections */ +/* Select BLAKE3 implementation */ #define IMPL_FASTEST (UINT32_MAX) -#define IMPL_CYCLE (UINT32_MAX-1) -#define IMPL_USER (UINT32_MAX-2) -#define IMPL_PARAM (UINT32_MAX-3) +#define IMPL_CYCLE (UINT32_MAX - 1) -#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) -static uint32_t icp_blake3_impl = IMPL_FASTEST; +#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) -#define BLAKE3_IMPL_NAME_MAX 16 +/* Indicate that benchmark 
has been done */ +static boolean_t blake3_initialized = B_FALSE; -/* id of fastest implementation */ -static uint32_t blake3_fastest_id = 0; +/* Implementation that contains the fastest methods */ +static blake3_ops_t blake3_fastest_impl = { + .name = "fastest" +}; -/* currently used id */ -static uint32_t blake3_current_id = 0; +/* Hold all supported implementations */ +static const blake3_ops_t *blake3_supp_impls[ARRAY_SIZE(blake3_impls)]; +static uint32_t blake3_supp_impls_cnt = 0; -/* id of module parameter (-1 == unused) */ -static int blake3_param_id = -1; +/* Currently selected implementation */ +static uint32_t blake3_impl_chosen = IMPL_FASTEST; -/* return number of supported implementations */ -int -blake3_get_impl_count(void) +static struct blake3_impl_selector { + const char *name; + uint32_t sel; +} blake3_impl_selectors[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST } +}; + +/* check the supported implementations */ +static void blake3_impl_init(void) { - static int impls = 0; - int i; + int i, c; - if (impls) - return (impls); + /* init only once */ + if (likely(blake3_initialized)) + return; - for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - impls++; + /* move supported implementations into blake3_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(blake3_impls); i++) { + const blake3_ops_t *impl = blake3_impls[i]; + + if (impl->is_supported && impl->is_supported()) + blake3_supp_impls[c++] = impl; } + blake3_supp_impls_cnt = c; - return (impls); + /* first init generic impl, may be changed via set_fastest() */ + memcpy(&blake3_fastest_impl, blake3_impls[0], + sizeof (blake3_fastest_impl)); + blake3_initialized = B_TRUE; } -/* return id of selected implementation */ -int -blake3_get_impl_id(void) +/* get number of supported implementations */ +uint32_t +blake3_impl_getcnt(void) { - return (blake3_current_id); + blake3_impl_init(); + return (blake3_supp_impls_cnt); } -/* return name of 
selected implementation */ +/* get id of selected implementation */ +uint32_t +blake3_impl_getid(void) +{ + return (IMPL_READ(blake3_impl_chosen)); +} + +/* get name of selected implementation */ const char * -blake3_get_impl_name(void) +blake3_impl_getname(void) { - return (blake3_selected_impl->name); + uint32_t impl = IMPL_READ(blake3_impl_chosen); + + blake3_impl_init(); + switch (impl) { + case IMPL_FASTEST: + return ("fastest"); + case IMPL_CYCLE: + return ("cycle"); + default: + return (blake3_supp_impls[impl]->name); + } } /* setup id as fastest implementation */ void -blake3_set_impl_fastest(uint32_t id) +blake3_impl_set_fastest(uint32_t id) { - blake3_fastest_id = id; + /* setup fastest impl */ + memcpy(&blake3_fastest_impl, blake3_supp_impls[id], + sizeof (blake3_fastest_impl)); } /* set implementation by id */ void -blake3_set_impl_id(uint32_t id) +blake3_impl_setid(uint32_t id) { - int i, cid; - - /* select fastest */ - if (id == IMPL_FASTEST) - id = blake3_fastest_id; - - /* select next or first */ - if (id == IMPL_CYCLE) - id = (++blake3_current_id) % blake3_get_impl_count(); - - /* 0..N for the real impl */ - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - if (cid == id) { - blake3_current_id = cid; - blake3_selected_impl = blake3_impls[i]; - return; - } - cid++; + blake3_impl_init(); + switch (id) { + case IMPL_FASTEST: + atomic_swap_32(&blake3_impl_chosen, IMPL_FASTEST); + break; + case IMPL_CYCLE: + atomic_swap_32(&blake3_impl_chosen, IMPL_CYCLE); + break; + default: + ASSERT3U(id, >=, 0); + ASSERT3U(id, <, blake3_supp_impls_cnt); + atomic_swap_32(&blake3_impl_chosen, id); + break; } } /* set implementation by name */ int -blake3_set_impl_name(const char *name) +blake3_impl_setname(const char *val) { - int i, cid; + uint32_t impl = IMPL_READ(blake3_impl_chosen); + size_t val_len; + int i, err = -EINVAL; - if (strcmp(name, "fastest") == 0) { - atomic_swap_32(&icp_blake3_impl, 
IMPL_FASTEST); - blake3_set_impl_id(IMPL_FASTEST); - return (0); - } else if (strcmp(name, "cycle") == 0) { - atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE); - blake3_set_impl_id(IMPL_CYCLE); - return (0); - } + blake3_impl_init(); + val_len = strlen(val); + while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ + val_len--; - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - if (strcmp(name, blake3_impls[i]->name) == 0) { - if (icp_blake3_impl == IMPL_PARAM) { - blake3_param_id = cid; - return (0); - } - blake3_selected_impl = blake3_impls[i]; - blake3_current_id = cid; - return (0); + /* check mandatory implementations */ + for (i = 0; i < ARRAY_SIZE(blake3_impl_selectors); i++) { + const char *name = blake3_impl_selectors[i].name; + + if (val_len == strlen(name) && + strncmp(val, name, val_len) == 0) { + impl = blake3_impl_selectors[i].sel; + err = 0; + break; } - cid++; } - return (-EINVAL); -} + if (err != 0 && blake3_initialized) { + /* check all supported implementations */ + for (i = 0; i < blake3_supp_impls_cnt; i++) { + const char *name = blake3_supp_impls[i]->name; -/* setup implementation */ -void -blake3_setup_impl(void) -{ - switch (IMPL_READ(icp_blake3_impl)) { - case IMPL_PARAM: - blake3_set_impl_id(blake3_param_id); - atomic_swap_32(&icp_blake3_impl, IMPL_USER); - break; - case IMPL_FASTEST: - blake3_set_impl_id(IMPL_FASTEST); - break; - case IMPL_CYCLE: - blake3_set_impl_id(IMPL_CYCLE); - break; - default: - blake3_set_impl_id(blake3_current_id); - break; + if (val_len == strlen(name) && + strncmp(val, name, val_len) == 0) { + impl = i; + err = 0; + break; + } + } } + + if (err == 0) { + atomic_swap_32(&blake3_impl_chosen, impl); + } + + return (err); } -/* return selected implementation */ -const blake3_impl_ops_t * +const blake3_ops_t * blake3_impl_get_ops(void) { - /* each call to ops will cycle */ - if (icp_blake3_impl == IMPL_CYCLE) - blake3_set_impl_id(IMPL_CYCLE); + const 
blake3_ops_t *ops = NULL; + uint32_t impl = IMPL_READ(blake3_impl_chosen); - return (blake3_selected_impl); + blake3_impl_init(); + switch (impl) { + case IMPL_FASTEST: + ASSERT(blake3_initialized); + ops = &blake3_fastest_impl; + break; + case IMPL_CYCLE: + /* Cycle through supported implementations */ + ASSERT(blake3_initialized); + ASSERT3U(blake3_supp_impls_cnt, >, 0); + static uint32_t cycle_count = 0; + uint32_t idx = (++cycle_count) % blake3_supp_impls_cnt; + ops = blake3_supp_impls[idx]; + break; + default: + ASSERT3U(blake3_supp_impls_cnt, >, 0); + ASSERT3U(impl, <, blake3_supp_impls_cnt); + ops = blake3_supp_impls[impl]; + break; + } + + ASSERT3P(ops, !=, NULL); + return (ops); } #if defined(_KERNEL) + void **blake3_per_cpu_ctx; void @@ -215,6 +254,9 @@ blake3_per_cpu_ctx_init(void) blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX), KM_SLEEP); } + + /* init once in kernel mode */ + blake3_impl_init(); } void @@ -227,58 +269,94 @@ blake3_per_cpu_ctx_fini(void) memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *)); kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *)); } -#endif -#if defined(_KERNEL) && defined(__linux__) -static int -icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp) -{ - char req_name[BLAKE3_IMPL_NAME_MAX]; - size_t i; +#define IMPL_FMT(impl, i) (((impl) == (i)) ? 
"[%s] " : "%s ") - /* sanitize input */ - i = strnlen(name, BLAKE3_IMPL_NAME_MAX); - if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX) - return (-EINVAL); - - strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX); - while (i > 0 && isspace(req_name[i-1])) - i--; - req_name[i] = '\0'; - - atomic_swap_32(&icp_blake3_impl, IMPL_PARAM); - return (blake3_set_impl_name(req_name)); -} +#if defined(__linux__) static int -icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp) +blake3_param_get(char *buffer, zfs_kernel_param_t *unused) { - int i, cid, cnt = 0; + const uint32_t impl = IMPL_READ(blake3_impl_chosen); char *fmt; + int cnt = 0; /* cycling */ - fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle "; - cnt += sprintf(buffer + cnt, fmt); + fmt = IMPL_FMT(impl, IMPL_CYCLE); + cnt += sprintf(buffer + cnt, fmt, "cycle"); - /* fastest one */ - fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest "; - cnt += sprintf(buffer + cnt, fmt); + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + cnt += sprintf(buffer + cnt, fmt, "fastest"); - /* user selected */ - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - fmt = (icp_blake3_impl == IMPL_USER && - cid == blake3_current_id) ? 
"[%s] " : "%s "; - cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name); - cid++; + /* list all supported implementations */ + for (uint32_t i = 0; i < blake3_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + cnt += sprintf(buffer + cnt, fmt, + blake3_supp_impls[i]->name); } - buffer[cnt] = 0; - return (cnt); } -module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get, - NULL, 0644); -MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation."); +static int +blake3_param_set(const char *val, zfs_kernel_param_t *unused) +{ + (void) unused; + return (blake3_impl_setname(val)); +} + +#elif defined(__FreeBSD__) + +#include + +static int +blake3_param(ZFS_MODULE_PARAM_ARGS) +{ + int err; + + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(blake3_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + (void) sbuf_printf(s, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < blake3_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, blake3_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); + } + + char buf[16]; + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) { + return (err); + } + + return (-blake3_impl_setname(buf)); +} +#endif + +#undef IMPL_FMT + +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl, + blake3_param_set, blake3_param_get, ZMOD_RW, \ + "Select BLAKE3 implementation."); #endif diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h index 5254061c737..eef74eaa909 100644 --- a/module/icp/algs/blake3/blake3_impl.h +++ b/module/icp/algs/blake3/blake3_impl.h @@ -62,31 +62,31 @@ typedef struct blake3_impl_ops { 
blake3_is_supported_f is_supported; int degree; const char *name; -} blake3_impl_ops_t; +} blake3_ops_t; /* Return selected BLAKE3 implementation ops */ -extern const blake3_impl_ops_t *blake3_impl_get_ops(void); +extern const blake3_ops_t *blake3_impl_get_ops(void); -extern const blake3_impl_ops_t blake3_generic_impl; +extern const blake3_ops_t blake3_generic_impl; #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern const blake3_impl_ops_t blake3_sse2_impl; +extern const blake3_ops_t blake3_sse2_impl; #endif #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern const blake3_impl_ops_t blake3_sse41_impl; +extern const blake3_ops_t blake3_sse41_impl; #endif #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern const blake3_impl_ops_t blake3_avx2_impl; +extern const blake3_ops_t blake3_avx2_impl; #endif #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern const blake3_impl_ops_t blake3_avx512_impl; +extern const blake3_ops_t blake3_avx512_impl; #endif #if defined(__x86_64) diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c index aecd29edb16..8139789fd77 100644 --- a/module/icp/algs/blake3/blake3_x86-64.c +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -81,7 +81,7 @@ static boolean_t blake3_is_sse2_supported(void) #endif } -const blake3_impl_ops_t blake3_sse2_impl = { +const blake3_ops_t blake3_sse2_impl = { .compress_in_place = blake3_compress_in_place_sse2, .compress_xof = blake3_compress_xof_sse2, .hash_many = blake3_hash_many_sse2, @@ -147,7 +147,7 @@ static boolean_t blake3_is_sse41_supported(void) #endif } -const blake3_impl_ops_t blake3_sse41_impl = { +const blake3_ops_t blake3_sse41_impl = { .compress_in_place = blake3_compress_in_place_sse41, .compress_xof = blake3_compress_xof_sse41, .hash_many = 
blake3_hash_many_sse41, @@ -179,7 +179,7 @@ static boolean_t blake3_is_avx2_supported(void) zfs_avx2_available()); } -const blake3_impl_ops_t blake3_avx2_impl = { +const blake3_ops_t blake3_avx2_impl = { .compress_in_place = blake3_compress_in_place_sse41, .compress_xof = blake3_compress_xof_sse41, .hash_many = blake3_hash_many_avx2, @@ -237,7 +237,7 @@ static boolean_t blake3_is_avx512_supported(void) zfs_avx512vl_available()); } -const blake3_impl_ops_t blake3_avx512_impl = { +const blake3_ops_t blake3_avx512_impl = { .compress_in_place = blake3_compress_in_place_avx512, .compress_xof = blake3_compress_xof_avx512, .hash_many = blake3_hash_many_avx512, diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c index b9dc907afa8..74b4cb8d2e6 100644 --- a/module/zfs/zfs_chksum.c +++ b/module/zfs/zfs_chksum.c @@ -244,12 +244,13 @@ chksum_benchmark(void) #endif chksum_stat_t *cs; - int cbid = 0, id; + int cbid = 0; uint64_t max = 0; + uint32_t id, id_save; /* space for the benchmark times */ chksum_stat_cnt = 4; - chksum_stat_cnt += blake3_get_impl_count(); + chksum_stat_cnt += blake3_impl_getcnt(); chksum_stat_data = (chksum_stat_t *)kmem_zalloc( sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); @@ -290,20 +291,24 @@ chksum_benchmark(void) chksum_benchit(cs); /* blake3 */ - for (id = 0; id < blake3_get_impl_count(); id++) { - blake3_set_impl_id(id); + id_save = blake3_impl_getid(); + for (id = 0; id < blake3_impl_getcnt(); id++) { + blake3_impl_setid(id); cs = &chksum_stat_data[cbid++]; cs->init = abd_checksum_blake3_tmpl_init; cs->func = abd_checksum_blake3_native; cs->free = abd_checksum_blake3_tmpl_free; cs->name = "blake3"; - cs->impl = blake3_get_impl_name(); + cs->impl = blake3_impl_getname(); chksum_benchit(cs); if (cs->bs256k > max) { max = cs->bs256k; - blake3_set_impl_fastest(id); + blake3_impl_set_fastest(id); } } + + /* restore initial value */ + blake3_impl_setid(id_save); } void @@ -329,9 +334,6 @@ chksum_init(void) chksum_kstat_addr); 
kstat_install(chksum_kstat); } - - /* setup implementations */ - blake3_setup_impl(); } void diff --git a/tests/zfs-tests/cmd/checksum/blake3_test.c b/tests/zfs-tests/cmd/checksum/blake3_test.c index d57d0e047f0..648e1faaaeb 100644 --- a/tests/zfs-tests/cmd/checksum/blake3_test.c +++ b/tests/zfs-tests/cmd/checksum/blake3_test.c @@ -497,9 +497,9 @@ main(int argc, char *argv[]) } (void) printf("Running algorithm correctness tests:\n"); - for (id = 0; id < blake3_get_impl_count(); id++) { - blake3_set_impl_id(id); - const char *name = blake3_get_impl_name(); + for (id = 0; id < blake3_impl_getcnt(); id++) { + blake3_impl_setid(id); + const char *name = blake3_impl_getname(); dprintf("Result for BLAKE3-%s:\n", name); for (i = 0; TestArray[i].hash; i++) { blake3_test_t *cur = &TestArray[i]; @@ -565,9 +565,9 @@ main(int argc, char *argv[]) } while (0) printf("Running performance tests (hashing 1024 MiB of data):\n"); - for (id = 0; id < blake3_get_impl_count(); id++) { - blake3_set_impl_id(id); - const char *name = blake3_get_impl_name(); + for (id = 0; id < blake3_impl_getcnt(); id++) { + blake3_impl_setid(id); + const char *name = blake3_impl_getname(); BLAKE3_PERF_TEST(name, 256); } From eeca9d27d7f6936f433fedd7a1d37233bdd670cd Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sat, 3 Sep 2022 10:40:29 +0200 Subject: [PATCH 53/69] Add zfs_blake3_impl to zfs.4 The zfs module parameter zfs_blake3_impl got no manual page entry while adding BLAKE3 to OpenZFS. 
This commit adds the required notes about the parameter into zfs.4 Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Co-authored-by: Ryan Moeller Signed-off-by: Tino Reichardt Closes #13725 --- man/man4/zfs.4 | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index b2f3e7c61fb..90a8ca788c7 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1162,6 +1162,20 @@ Selecting any option other than results in vector instructions from the respective CPU instruction set being used. . +.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string +Select a BLAKE3 implementation. +.Pp +Supported selectors are: +.Sy cycle , fastest , generic , sse2 , sse41 , avx2 , avx512 . +All except +.Sy cycle , fastest No and Sy generic +require instruction set extensions to be available, +and will only appear if ZFS detects that they are present at runtime. +If multiple implementations of BLAKE3 are available, the +.Sy fastest No will be chosen using a micro benchmark. You can see the +benchmark results by reading this kstat file: +.Pa /proc/spl/kstat/zfs/chksum_bench . +. .It Sy zfs_free_bpobj_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable the processing of the free_bpobj object. . From 48cf170d5a9f6610db0f576238e054e727239e82 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 7 Sep 2022 20:33:59 +0200 Subject: [PATCH 54/69] Add PPC cpu feature tests for FreeBSD and Linux Add needed cpu feature tests for powerpc architecture.
Overview: zfs_altivec_available() - needed by RAID-Z zfs_vsx_available() - needed by BLAKE3 zfs_isa207_available() - needed by SHA2 Part 1 - Userspace - use getauxval() for Linux and elf_aux_info() for FreeBSD - direct including fails with double definitions - so we self define the needed functions and definitions Part 2 - Kernel space FreeBSD - use exported cpu_features of Part 3 - Kernel space Linux - use cpu_has_feature() function of Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Tino Reichardt Closes #13725 --- include/os/freebsd/Makefile.am | 1 + include/os/freebsd/spl/sys/simd.h | 8 +- include/os/freebsd/spl/sys/simd_powerpc.h | 90 ++++++++++++++++++++ include/os/freebsd/spl/sys/simd_x86.h | 50 +++++------ include/os/linux/kernel/linux/simd_powerpc.h | 83 +++++++++--------- lib/libspl/include/sys/simd.h | 89 ++++++++++--------- 6 files changed, 204 insertions(+), 117 deletions(-) create mode 100644 include/os/freebsd/spl/sys/simd_powerpc.h diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 5ddb7cd710b..3796f20ae7e 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -50,6 +50,7 @@ noinst_HEADERS = \ %D%/spl/sys/sid.h \ %D%/spl/sys/sig.h \ %D%/spl/sys/simd.h \ + %D%/spl/sys/simd_powerpc.h \ %D%/spl/sys/simd_x86.h \ %D%/spl/sys/spl_condvar.h \ %D%/spl/sys/string.h \ diff --git a/include/os/freebsd/spl/sys/simd.h b/include/os/freebsd/spl/sys/simd.h index 53503e83891..3106e4853c7 100644 --- a/include/os/freebsd/spl/sys/simd.h +++ b/include/os/freebsd/spl/sys/simd.h @@ -26,13 +26,16 @@ * $FreeBSD$ */ - #ifndef _FREEBSD_SIMD_H #define _FREEBSD_SIMD_H + #if defined(__amd64__) || defined(__i386__) #include -#else +#elif defined(__powerpc__) +#include + +#else #define kfpu_allowed() 0 #define kfpu_initialize(tsk) do {} while (0) #define kfpu_begin() do {} while (0) @@ -40,4 +43,5 @@ #define kfpu_init() (0) #define kfpu_fini() do {} while (0) #endif + #endif diff --git 
a/include/os/freebsd/spl/sys/simd_powerpc.h b/include/os/freebsd/spl/sys/simd_powerpc.h new file mode 100644 index 00000000000..b90240580c7 --- /dev/null +++ b/include/os/freebsd/spl/sys/simd_powerpc.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2022 Tino Reichardt + */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_allowed() + * kfpu_begin() + * kfpu_end() + * kfpu_init() + * kfpu_fini() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. 
+ * + * Supported features: + * zfs_altivec_available() + * zfs_vsx_available() + * zfs_isa207_available() + */ + +#ifndef _FREEBSD_SIMD_POWERPC_H +#define _FREEBSD_SIMD_POWERPC_H + +#include +#include + +#include +#include + +#define kfpu_allowed() 1 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +/* + * Check if Altivec is available + */ +static inline boolean_t +zfs_altivec_available(void) +{ + return ((cpu_features & PPC_FEATURE_HAS_ALTIVEC) != 0); +} + +/* + * Check if VSX is available + */ +static inline boolean_t +zfs_vsx_available(void) +{ + return ((cpu_features & PPC_FEATURE_HAS_VSX) != 0); +} + +/* + * Check if POWER ISA 2.07 is available (SHA2) + */ +static inline boolean_t +zfs_isa207_available(void) +{ + return ((cpu_features2 & PPC_FEATURE2_ARCH_2_07) != 0); +} diff --git a/include/os/freebsd/spl/sys/simd_x86.h b/include/os/freebsd/spl/sys/simd_x86.h index 480bfd28973..7a0ca243f76 100644 --- a/include/os/freebsd/spl/sys/simd_x86.h +++ b/include/os/freebsd/spl/sys/simd_x86.h @@ -77,7 +77,7 @@ __simd_state_enabled(const uint64_t state) boolean_t has_osxsave; uint64_t xcr0; - has_osxsave = !!(cpu_feature2 & CPUID2_OSXSAVE); + has_osxsave = (cpu_feature2 & CPUID2_OSXSAVE) != 0; if (!has_osxsave) return (B_FALSE); @@ -99,7 +99,7 @@ __simd_state_enabled(const uint64_t state) static inline boolean_t zfs_sse_available(void) { - return (!!(cpu_feature & CPUID_SSE)); + return ((cpu_feature & CPUID_SSE) != 0); } /* @@ -108,7 +108,7 @@ zfs_sse_available(void) static inline boolean_t zfs_sse2_available(void) { - return (!!(cpu_feature & CPUID_SSE2)); + return ((cpu_feature & CPUID_SSE2) != 0); } /* @@ -117,7 +117,7 @@ zfs_sse2_available(void) static inline boolean_t zfs_sse3_available(void) { - return (!!(cpu_feature2 & CPUID2_SSE3)); + return ((cpu_feature2 & CPUID2_SSE3) != 0); } /* @@ -126,7 +126,7 @@ zfs_sse3_available(void) 
static inline boolean_t zfs_ssse3_available(void) { - return (!!(cpu_feature2 & CPUID2_SSSE3)); + return ((cpu_feature2 & CPUID2_SSSE3) != 0); } /* @@ -135,7 +135,7 @@ zfs_ssse3_available(void) static inline boolean_t zfs_sse4_1_available(void) { - return (!!(cpu_feature2 & CPUID2_SSE41)); + return ((cpu_feature2 & CPUID2_SSE41) != 0); } /* @@ -144,7 +144,7 @@ zfs_sse4_1_available(void) static inline boolean_t zfs_sse4_2_available(void) { - return (!!(cpu_feature2 & CPUID2_SSE42)); + return ((cpu_feature2 & CPUID2_SSE42) != 0); } /* @@ -155,7 +155,7 @@ zfs_avx_available(void) { boolean_t has_avx; - has_avx = !!(cpu_feature2 & CPUID2_AVX); + has_avx = (cpu_feature2 & CPUID2_AVX) != 0; return (has_avx && __ymm_enabled()); } @@ -168,7 +168,7 @@ zfs_avx2_available(void) { boolean_t has_avx2; - has_avx2 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX2); + has_avx2 = (cpu_stdext_feature & CPUID_STDEXT_AVX2) != 0; return (has_avx2 && __ymm_enabled()); } @@ -196,7 +196,7 @@ zfs_avx512f_available(void) { boolean_t has_avx512; - has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0; return (has_avx512 && __zmm_enabled()); } @@ -207,8 +207,8 @@ zfs_avx512cd_available(void) { boolean_t has_avx512; - has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512CD); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0 && + (cpu_stdext_feature & CPUID_STDEXT_AVX512CD) != 0; return (has_avx512 && __zmm_enabled()); } @@ -219,8 +219,8 @@ zfs_avx512er_available(void) { boolean_t has_avx512; - has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512CD); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0 && + (cpu_stdext_feature & CPUID_STDEXT_AVX512CD) != 0; return (has_avx512 && __zmm_enabled()); } @@ -231,8 +231,8 @@ zfs_avx512pf_available(void) { boolean_t has_avx512; - has_avx512 = 
!!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512PF); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0 && + (cpu_stdext_feature & CPUID_STDEXT_AVX512PF) != 0; return (has_avx512 && __zmm_enabled()); } @@ -243,7 +243,7 @@ zfs_avx512bw_available(void) { boolean_t has_avx512 = B_FALSE; - has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512BW); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512BW) != 0; return (has_avx512 && __zmm_enabled()); } @@ -254,8 +254,8 @@ zfs_avx512dq_available(void) { boolean_t has_avx512; - has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512DQ); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0 && + (cpu_stdext_feature & CPUID_STDEXT_AVX512DQ) != 0; return (has_avx512 && __zmm_enabled()); } @@ -266,8 +266,8 @@ zfs_avx512vl_available(void) { boolean_t has_avx512; - has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512VL); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0 && + (cpu_stdext_feature & CPUID_STDEXT_AVX512VL) != 0; return (has_avx512 && __zmm_enabled()); } @@ -278,8 +278,8 @@ zfs_avx512ifma_available(void) { boolean_t has_avx512; - has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - !!(cpu_stdext_feature & CPUID_STDEXT_AVX512IFMA); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0 && + (cpu_stdext_feature & CPUID_STDEXT_AVX512IFMA) != 0; return (has_avx512 && __zmm_enabled()); } @@ -290,8 +290,8 @@ zfs_avx512vbmi_available(void) { boolean_t has_avx512; - has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && - !!(cpu_stdext_feature & CPUID_STDEXT_BMI1); + has_avx512 = (cpu_stdext_feature & CPUID_STDEXT_AVX512F) != 0 && + (cpu_stdext_feature & CPUID_STDEXT_BMI1) != 0; return (has_avx512 && __zmm_enabled()); } diff --git a/include/os/linux/kernel/linux/simd_powerpc.h 
b/include/os/linux/kernel/linux/simd_powerpc.h index 764c5dc51f9..2a2f92bc499 100644 --- a/include/os/linux/kernel/linux/simd_powerpc.h +++ b/include/os/linux/kernel/linux/simd_powerpc.h @@ -21,6 +21,7 @@ /* * Copyright (C) 2019 Romain Dolbeau * + * Copyright (C) 2022 Tino Reichardt */ /* @@ -41,7 +42,9 @@ * all relevant feature test functions should be called. * * Supported features: - * zfs_altivec_available() + * zfs_altivec_available() + * zfs_vsx_available() + * zfs_isa207_available() */ #ifndef _LINUX_SIMD_POWERPC_H @@ -57,73 +60,65 @@ #include #include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) +#include +#else +#include +#endif + #define kfpu_allowed() 1 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) -#define kfpu_end() \ - { \ - disable_kernel_vsx(); \ - disable_kernel_altivec(); \ - preempt_enable(); \ - } #define kfpu_begin() \ { \ preempt_disable(); \ enable_kernel_altivec(); \ enable_kernel_vsx(); \ + enable_kernel_spe(); \ + } +#define kfpu_end() \ + { \ + disable_kernel_spe(); \ + disable_kernel_vsx(); \ + disable_kernel_altivec(); \ + preempt_enable(); \ } #else /* seems that before 4.5 no-one bothered */ #define kfpu_begin() #define kfpu_end() preempt_enable() #endif + #define kfpu_init() 0 #define kfpu_fini() ((void) 0) -static inline boolean_t -zfs_vsx_available(void) -{ - boolean_t res; -#if defined(__powerpc64__) - u64 msr; -#else - u32 msr; -#endif - kfpu_begin(); - __asm volatile("mfmsr %0" : "=r"(msr)); - res = (msr & 0x800000) != 0; - kfpu_end(); - return (res); -} - /* * Check if AltiVec instruction set is available */ static inline boolean_t zfs_altivec_available(void) { - boolean_t res; - /* suggested by macallan at netbsd dot org */ -#if defined(__powerpc64__) - u64 msr; -#else - u32 msr; -#endif - kfpu_begin(); - __asm volatile("mfmsr %0" : "=r"(msr)); - /* - * 64 bits -> need to check bit 38 - * Power ISA Version 3.0B - * p944 - * 32 bits -> Need to check bit 6 - * AltiVec Technology Programming Environments Manual - * p49 
(2-9) - * They are the same, as ppc counts 'backward' ... - */ - res = (msr & 0x2000000) != 0; - kfpu_end(); - return (res); + return (cpu_has_feature(CPU_FTR_ALTIVEC)); } + +/* + * Check if VSX is available + */ +static inline boolean_t +zfs_vsx_available(void) +{ + return (cpu_has_feature(CPU_FTR_VSX)); +} + +/* + * Check if POWER ISA 2.07 is available (SHA2) + */ +static inline boolean_t +zfs_isa207_available(void) +{ + return (cpu_has_feature(CPU_FTR_ARCH_207S)); +} + #endif /* defined(__powerpc) */ #endif /* _LINUX_SIMD_POWERPC_H */ diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h index c9d86a0808f..c0099dd7919 100644 --- a/lib/libspl/include/sys/simd.h +++ b/lib/libspl/include/sys/simd.h @@ -20,8 +20,8 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2022 Tino Reichardt */ #ifndef _LIBSPL_SYS_SIMD_H @@ -452,63 +452,60 @@ zfs_avx512vbmi_available(void) #elif defined(__powerpc__) +/* including clashes with AT_UID and others */ +extern unsigned long getauxval(unsigned long type); +#if defined(__FreeBSD__) +#define AT_HWCAP 25 /* CPU feature flags. */ +#define AT_HWCAP2 26 /* CPU feature flags 2. */ +extern int elf_aux_info(int aux, void *buf, int buflen); +static unsigned long getauxval(unsigned long key) +{ + unsigned long val = 0UL; + + if (elf_aux_info((int)key, &val, sizeof (val)) != 0) + return (0UL); + + return (val); +} +#elif defined(__linux__) +#define AT_HWCAP 16 /* CPU feature flags. */ +#define AT_HWCAP2 26 /* CPU feature flags 2. 
*/ +#endif + #define kfpu_allowed() 1 #define kfpu_initialize(tsk) do {} while (0) #define kfpu_begin() do {} while (0) #define kfpu_end() do {} while (0) -/* - * Check if AltiVec instruction set is available - * No easy way beyond 'altivec works' :-( - */ -#include -#include - -#if defined(__ALTIVEC__) && !defined(__FreeBSD__) -static jmp_buf env; -static void sigillhandler(int x) -{ - (void) x; - longjmp(env, 1); -} -#endif - +#define PPC_FEATURE_HAS_ALTIVEC 0x10000000 static inline boolean_t zfs_altivec_available(void) { - boolean_t has_altivec = B_FALSE; -#if defined(__ALTIVEC__) && !defined(__FreeBSD__) - sighandler_t savesig; - savesig = signal(SIGILL, sigillhandler); - if (setjmp(env)) { - signal(SIGILL, savesig); - has_altivec = B_FALSE; - } else { - __asm__ __volatile__("vor 0,0,0\n" : : : "v0"); - signal(SIGILL, savesig); - has_altivec = B_TRUE; - } -#endif - return (has_altivec); + unsigned long hwcap = getauxval(AT_HWCAP); + + return (hwcap & PPC_FEATURE_HAS_ALTIVEC); } + +#define PPC_FEATURE_HAS_VSX 0x00000080 static inline boolean_t zfs_vsx_available(void) { - boolean_t has_vsx = B_FALSE; -#if defined(__ALTIVEC__) && !defined(__FreeBSD__) - sighandler_t savesig; - savesig = signal(SIGILL, sigillhandler); - if (setjmp(env)) { - signal(SIGILL, savesig); - has_vsx = B_FALSE; - } else { - __asm__ __volatile__("xssubsp 0,0,0\n"); - signal(SIGILL, savesig); - has_vsx = B_TRUE; - } -#endif - return (has_vsx); + unsigned long hwcap = getauxval(AT_HWCAP); + + return (hwcap & PPC_FEATURE_HAS_VSX); } + +#define PPC_FEATURE2_ARCH_2_07 0x80000000 +static inline boolean_t +zfs_isa207_available(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + + return ((hwcap & PPC_FEATURE_HAS_VSX) && + (hwcap2 & PPC_FEATURE2_ARCH_2_07)); +} + #else #define kfpu_allowed() 0 From 9a671fe7ecbc5f6ca07d96869207720a37b088e4 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 20 Sep 2022 02:17:27 +0200 Subject: [PATCH 55/69] 
FreeBSD: stop passing LK_INTERLOCK to VOP_LOCK There is an ongoing effort to eliminate this feature. Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Mateusz Guzik Closes #13908 --- module/os/freebsd/zfs/zfs_ctldir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index 4b95b49dc40..42bb7551e9c 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -977,12 +977,13 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap) */ VI_LOCK(*vpp); if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + VI_UNLOCK(*vpp); /* * Upgrade to exclusive lock in order to: * - avoid race conditions * - satisfy the contract of mount_snapshot() */ - err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + err = VOP_LOCK(*vpp, LK_TRYUPGRADE); if (err == 0) break; } else { From 042d43a1ddf114ea72d83fd45cc926724f74f5fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Tue, 20 Sep 2022 02:21:45 +0200 Subject: [PATCH 56/69] FreeBSD: fix static module build broken in 7bb707ffa param_set_arc_free_target(SYSCTL_HANDLER_ARGS) and param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) defined in sysctl_os.c must be made available to arc_os.c. 
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Martin Matuska Closes #13915 --- include/os/freebsd/zfs/sys/arc_os.h | 34 +++++++++++++++++++++++++++++ module/os/freebsd/zfs/arc_os.c | 3 +-- module/os/freebsd/zfs/sysctl_os.c | 5 +++-- 3 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 include/os/freebsd/zfs/sys/arc_os.h diff --git a/include/os/freebsd/zfs/sys/arc_os.h b/include/os/freebsd/zfs/sys/arc_os.h new file mode 100644 index 00000000000..a95618b91fe --- /dev/null +++ b/include/os/freebsd/zfs/sys/arc_os.h @@ -0,0 +1,34 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Martin Matuska + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _SYS_ARC_OS_H +#define _SYS_ARC_OS_H + +int param_set_arc_free_target(SYSCTL_HANDLER_ARGS); +int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); + +#endif diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index f1a3a0fafa9..30e96a889e0 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -72,11 +73,9 @@ SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, * We don't have a tunable for arc_free_target due to the dependency on * pagedaemon initialisation. */ -int param_set_arc_free_target(SYSCTL_HANDLER_ARGS); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, free_target, param_set_arc_free_target, 0, CTLFLAG_RW, "Desired number of free pages below which ARC triggers reclaim"); -int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift, param_set_arc_no_grow_shift, 0, ZMOD_RW, "log2(fraction of ARC which must be free to allow growing)"); diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index cd384c205df..4d908381c40 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -228,7 +229,7 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, extern uint_t zfs_arc_free_target; -static int +int param_set_arc_free_target(SYSCTL_HANDLER_ARGS) { uint_t val; @@ -261,7 +262,7 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, " (LEGACY)"); /* END CSTYLED */ -static int +int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { int err, val; From 891ac937beb959cad94a2ba267e4b56dee930a5e Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 19 Sep 2022 20:30:58 -0400 Subject: [PATCH 57/69] Linux: Fix use-after-free in zfsvfs_create() Coverity reported that we pass a pointer to zfsvfs to `dmu_objset_disown()` after 
freeing zfsvfs in zfsvfs_create_impl() after a failure in zfsvfs_init(). We have nearly identical duplicate versions of this code for FreeBSD and Linux, but interestingly, the FreeBSD version of this code differs in such a way that it does not suffer from this bug. We remove the difference from the FreeBSD version to fix this bug. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13883 --- module/os/linux/zfs/zfs_vfsops.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 251d9e9a40f..64d6b4616e1 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -784,9 +784,7 @@ zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) } error = zfsvfs_create_impl(zfvp, zfsvfs, os); - if (error != 0) { - dmu_objset_disown(os, B_TRUE, zfsvfs); - } + return (error); } @@ -826,6 +824,7 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) error = zfsvfs_init(zfsvfs, os); if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; zfsvfs_free(zfsvfs); return (error); From f272960d52bdc5689078d3cb7cd9e0233cd1a8cd Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 19 Sep 2022 20:32:18 -0400 Subject: [PATCH 58/69] Fix usage of zed_log_msg() and zfs_panic_recover() Coverity complained about the format specifiers not matching variables. In one case, the variable is a constant, so we fix it. In another, we were missing an argument (about which coverity also complained). 
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Richard Yao Closes #13888 --- cmd/zed/agents/fmd_api.c | 2 +- cmd/zed/agents/zfs_mod.c | 2 +- cmd/zed/zed_conf.c | 4 ++-- cmd/zed/zed_disk_event.c | 2 +- cmd/zed/zed_exec.c | 2 +- module/zfs/spa_checkpoint.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c index 9e46e831d51..56c134b731b 100644 --- a/cmd/zed/agents/fmd_api.c +++ b/cmd/zed/agents/fmd_api.c @@ -372,7 +372,7 @@ zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code) if (code != NULL) zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code); if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0) - zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte); + zed_log_msg(LOG_INFO, "\t%s: %hhu", FM_FAULT_CERTAINTY, byte); if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0) zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME, diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 7364dd2c628..af6de73a1cc 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -364,7 +364,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) (vs->vs_state != VDEV_STATE_FAULTED) && (vs->vs_state != VDEV_STATE_CANT_OPEN)) { zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in " - "a bad state (currently %d)", vs->vs_state); + "a bad state (currently %llu)", vs->vs_state); return; } diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c index 9a39d1a8098..29de27c77c3 100644 --- a/cmd/zed/zed_conf.c +++ b/cmd/zed/zed_conf.c @@ -657,7 +657,7 @@ zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]) } else if (n != len) { errno = EIO; zed_log_msg(LOG_WARNING, - "Failed to read state file \"%s\": Read %d of %d bytes", + "Failed to read state file \"%s\": Read %zd of %zd bytes", zcp->state_file, n, len); return (-1); } 
@@ -706,7 +706,7 @@ zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]) if (n != len) { errno = EIO; zed_log_msg(LOG_WARNING, - "Failed to write state file \"%s\": Wrote %d of %d bytes", + "Failed to write state file \"%s\": Wrote %zd of %zd bytes", zcp->state_file, n, len); return (-1); } diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 3c8e2fb38c1..db89ecc907b 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -49,7 +49,7 @@ struct udev_monitor *g_mon; #define DEV_BYID_PATH "/dev/disk/by-id/" /* 64MB is minimum usable disk for ZFS */ -#define MINIMUM_SECTORS 131072 +#define MINIMUM_SECTORS 131072ULL /* diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c index 369c4b6950c..51c292d41cc 100644 --- a/cmd/zed/zed_exec.c +++ b/cmd/zed/zed_exec.c @@ -263,7 +263,7 @@ _reap_children(void *arg) zed_log_msg(LOG_INFO, "Finished \"%s\" eid=%llu pid=%d " "time=%llu.%06us status=0x%X", - node.name, node.eid, + node.name, node.eid, pid, (unsigned long long) usage.ru_utime.tv_sec, (unsigned int) usage.ru_utime.tv_usec, (unsigned int) status); diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index b5b1dfa8a08..a837b1ce97e 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) if (error != 0) { zfs_panic_recover("zfs: error %lld was returned " "while incrementally destroying the checkpoint " - "space map of vdev %u\n", + "space map of vdev %llu\n", (longlong_t)error, vd->vdev_id); } ASSERT0(words_after); From e8bdc74528c2d0a97e324051e74aeda2e501d1d0 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 19 Sep 2022 20:33:52 -0400 Subject: [PATCH 59/69] Cleanup: Remove unused uu_pname code Coverity caught a possible NULL pointer dereference in dead code. We can delete it all. 
Reviewed-by: Brian Behlendorf Reviewed-by: Chunwei Chen Signed-off-by: Richard Yao Closes #13900 --- include/libuutil.h | 33 ------- lib/libuutil/Makefile.am | 1 - lib/libuutil/libuutil.abi | 73 -------------- lib/libuutil/uu_pname.c | 202 -------------------------------------- 4 files changed, 309 deletions(-) delete mode 100644 lib/libuutil/uu_pname.c diff --git a/include/libuutil.h b/include/libuutil.h index cb3d366c476..906b49ea5ca 100644 --- a/include/libuutil.h +++ b/include/libuutil.h @@ -56,13 +56,6 @@ extern "C" { #define UU_ERROR_SYSTEM 99 /* underlying system error */ #define UU_ERROR_UNKNOWN 100 /* error status not known */ -/* - * Standard program exit codes. - */ -#define UU_EXIT_OK (*(uu_exit_ok())) -#define UU_EXIT_FATAL (*(uu_exit_fatal())) -#define UU_EXIT_USAGE (*(uu_exit_usage())) - /* * Exit status profiles. */ @@ -75,32 +68,6 @@ extern "C" { uint32_t uu_error(void); const char *uu_strerror(uint32_t); -/* - * Program notification functions. - */ -extern void uu_alt_exit(int); -extern const char *uu_setpname(char *); -extern const char *uu_getpname(void); -extern void uu_warn(const char *, ...) - __attribute__((format(printf, 1, 2))); -extern void uu_vwarn(const char *, va_list) - __attribute__((format(printf, 1, 0))); -extern __attribute__((noreturn)) void uu_die(const char *, ...) - __attribute__((format(printf, 1, 2))); -extern __attribute__((noreturn)) void uu_vdie(const char *, va_list) - __attribute__((format(printf, 1, 0))); -extern __attribute__((noreturn)) void uu_xdie(int, const char *, ...) - __attribute__((format(printf, 2, 3))); -extern __attribute__((noreturn)) void uu_vxdie(int, const char *, va_list) - __attribute__((format(printf, 2, 0))); - -/* - * Exit status functions (not to be used directly) - */ -extern int *uu_exit_ok(void); -extern int *uu_exit_fatal(void); -extern int *uu_exit_usage(void); - /* * Identifier test flags and function. 
*/ diff --git a/lib/libuutil/Makefile.am b/lib/libuutil/Makefile.am index 339f9a06474..b973ce3cca4 100644 --- a/lib/libuutil/Makefile.am +++ b/lib/libuutil/Makefile.am @@ -9,7 +9,6 @@ libuutil_la_SOURCES = \ %D%/uu_ident.c \ %D%/uu_list.c \ %D%/uu_misc.c \ - %D%/uu_pname.c \ %D%/uu_string.c libuutil_la_LIBADD = \ diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi index 766d8843000..f5186a0837a 100644 --- a/lib/libuutil/libuutil.abi +++ b/lib/libuutil/libuutil.abi @@ -1744,79 +1744,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/lib/libuutil/uu_pname.c b/lib/libuutil/uu_pname.c deleted file mode 100644 index 37c093731ef..00000000000 --- a/lib/libuutil/uu_pname.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - - - -#include "libuutil_common.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static const char *pname; - -static __attribute__((noreturn)) void -uu_die_internal(int status, const char *format, va_list alist); - -int uu_exit_ok_value = EXIT_SUCCESS; -int uu_exit_fatal_value = EXIT_FAILURE; -int uu_exit_usage_value = 2; - -int * -uu_exit_ok(void) -{ - return (&uu_exit_ok_value); -} - -int * -uu_exit_fatal(void) -{ - return (&uu_exit_fatal_value); -} - -int * -uu_exit_usage(void) -{ - return (&uu_exit_usage_value); -} - -void -uu_alt_exit(int profile) -{ - switch (profile) { - case UU_PROFILE_DEFAULT: - uu_exit_ok_value = EXIT_SUCCESS; - uu_exit_fatal_value = EXIT_FAILURE; - uu_exit_usage_value = 2; - break; - case UU_PROFILE_LAUNCHER: - uu_exit_ok_value = EXIT_SUCCESS; - uu_exit_fatal_value = 124; - uu_exit_usage_value = 125; - break; - } -} - -static __attribute__((format(printf, 2, 0))) void -uu_warn_internal(int err, const char *format, va_list alist) -{ - if (pname != NULL) - (void) fprintf(stderr, "%s: ", pname); - - if (format != NULL) - (void) vfprintf(stderr, format, alist); - - if (strrchr(format, '\n') == NULL) - (void) fprintf(stderr, ": %s\n", strerror(err)); -} - -void -uu_vwarn(const char *format, va_list alist) -{ - uu_warn_internal(errno, format, alist); -} - -void -uu_warn(const char *format, ...) 
-{ - va_list alist; - va_start(alist, format); - uu_warn_internal(errno, format, alist); - va_end(alist); -} - -static __attribute__((format(printf, 2, 0))) __attribute__((noreturn)) void -uu_die_internal(int status, const char *format, va_list alist) -{ - uu_warn_internal(errno, format, alist); -#ifdef DEBUG - { - char *cp; - - if (!issetugid()) { - cp = getenv("UU_DIE_ABORTS"); - if (cp != NULL && *cp != '\0') - abort(); - } - } -#endif - exit(status); -} - -void -uu_vdie(const char *format, va_list alist) -{ - uu_die_internal(UU_EXIT_FATAL, format, alist); -} - -void -uu_die(const char *format, ...) -{ - va_list alist; - va_start(alist, format); - uu_die_internal(UU_EXIT_FATAL, format, alist); - va_end(alist); -} - -void -uu_vxdie(int status, const char *format, va_list alist) -{ - uu_die_internal(status, format, alist); -} - -void -uu_xdie(int status, const char *format, ...) -{ - va_list alist; - va_start(alist, format); - uu_die_internal(status, format, alist); - va_end(alist); -} - -const char * -uu_setpname(char *arg0) -{ - /* - * Having a NULL argv[0], while uncommon, is possible. It - * makes more sense to handle this event in uu_setpname rather - * than in each of its consumers. - */ - if (arg0 == NULL) { - pname = getexecname(); - if (pname == NULL) - pname = "unknown_command"; - return (pname); - } - - /* - * Guard against '/' at end of command invocation. - */ - for (;;) { - char *p = strrchr(arg0, '/'); - if (p == NULL) { - pname = arg0; - break; - } else { - if (*(p + 1) == '\0') { - *p = '\0'; - continue; - } - - pname = p + 1; - break; - } - } - - return (pname); -} - -const char * -uu_getpname(void) -{ - return (pname); -} From 9276e202eba330baf253ff7b0f7a631d5915c116 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 20 Sep 2022 17:43:03 -0400 Subject: [PATCH 60/69] FreeBSD: Fix uninitialized pointer read in spa_import_rootpool() The FreeBSD project's coverity scans found this. 
Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Richard Yao Closes #13923 --- module/os/freebsd/zfs/spa_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c index 9bc61a6c8fe..45ea10bb487 100644 --- a/module/os/freebsd/zfs/spa_os.c +++ b/module/os/freebsd/zfs/spa_os.c @@ -249,7 +249,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) mutex_exit(&spa_namespace_lock); fnvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", - pname); + name); return (error); } From 3f400b0f5851e3e5e4f90dd8f5fe083acdeb7ea6 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 20 Sep 2022 17:50:16 -0400 Subject: [PATCH 61/69] FreeBSD: Cleanup zfs_readdir() The FreeBSD project's coverity scans found dead code in `zfs_readdir()`. Also, the comment above `zfs_readdir()` is out of date. I fixed the comment and deleted all of the dead code, plus additional dead code that was found upon review. 
Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Richard Yao Closes #13924 --- include/os/freebsd/Makefile.am | 1 - include/os/freebsd/spl/sys/extdirent.h | 71 ------------------------- include/os/freebsd/spl/sys/vnode_impl.h | 13 ----- module/os/freebsd/zfs/zfs_dir.c | 1 - module/os/freebsd/zfs/zfs_vnops_os.c | 68 ++++++----------------- 5 files changed, 16 insertions(+), 138 deletions(-) delete mode 100644 include/os/freebsd/spl/sys/extdirent.h diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 3796f20ae7e..a750f52e7d2 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -21,7 +21,6 @@ noinst_HEADERS = \ %D%/spl/sys/dirent.h \ %D%/spl/sys/disp.h \ %D%/spl/sys/dkio.h \ - %D%/spl/sys/extdirent.h \ %D%/spl/sys/fcntl.h \ %D%/spl/sys/file.h \ %D%/spl/sys/freebsd_rwlock.h \ diff --git a/include/os/freebsd/spl/sys/extdirent.h b/include/os/freebsd/spl/sys/extdirent.h deleted file mode 100644 index d6927ae40bb..00000000000 --- a/include/os/freebsd/spl/sys/extdirent.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. - */ - -#ifndef _SYS_EXTDIRENT_H -#define _SYS_EXTDIRENT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -/* - * Extended file-system independent directory entry. This style of - * dirent provides additional informational flag bits for each - * directory entry. This dirent will be returned instead of the - * standard dirent if a VOP_READDIR() requests dirent flags via - * V_RDDIR_ENTFLAGS, and if the file system supports the flags. - */ -typedef struct edirent { - ino64_t ed_ino; /* "inode number" of entry */ - off64_t ed_off; /* offset of disk directory entry */ - uint32_t ed_eflags; /* per-entry flags */ - unsigned short ed_reclen; /* length of this record */ - char ed_name[1]; /* name of file */ -} edirent_t; - -#define EDIRENT_RECLEN(namelen) \ - ((offsetof(edirent_t, ed_name[0]) + 1 + (namelen) + 7) & ~ 7) -#define EDIRENT_NAMELEN(reclen) \ - ((reclen) - (offsetof(edirent_t, ed_name[0]))) - -/* - * Extended entry flags - * Extended entries include a bitfield of extra information - * regarding that entry. 
- */ -#define ED_CASE_CONFLICT 0x10 /* Disconsidering case, entry is not unique */ - -/* - * Extended flags accessor function - */ -#define ED_CASE_CONFLICTS(x) ((x)->ed_eflags & ED_CASE_CONFLICT) -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_EXTDIRENT_H */ diff --git a/include/os/freebsd/spl/sys/vnode_impl.h b/include/os/freebsd/spl/sys/vnode_impl.h index 3e698d7ac92..4e04b5e80a0 100644 --- a/include/os/freebsd/spl/sys/vnode_impl.h +++ b/include/os/freebsd/spl/sys/vnode_impl.h @@ -44,8 +44,6 @@ #define IS_DEVVP(vp) \ ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) -#define V_XATTRDIR 0x0000 /* attribute unnamed directory */ - #define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ /* @@ -193,11 +191,6 @@ #define MODEMASK 07777 /* mode bits plus permission bits */ #define PERMMASK 00777 /* permission bits */ -/* - * VOP_ACCESS flags - */ -#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ - /* * Flags for vnode operations. */ @@ -234,12 +227,6 @@ struct taskq; #define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ #define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */ -/* - * Flags for VOP_READDIR - */ -#define V_RDDIR_ENTFLAGS 0x01 /* request dirent flags */ -#define V_RDDIR_ACCFILTER 0x02 /* filter out inaccessible dirents */ - /* * Public vnode manipulation functions. 
*/ diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c index 6321f0b532a..778e4151656 100644 --- a/module/os/freebsd/zfs/zfs_dir.c +++ b/module/os/freebsd/zfs/zfs_dir.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 57889b7390e..e2222df123f 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -76,7 +76,6 @@ #include #include #include -#include #include #include #include @@ -1648,10 +1647,11 @@ zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags) * and return buffer. * cr - credentials of caller. * ct - caller context - * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. + * ncookies- number of entries in cookies + * cookies - offsets to directory entries * * RETURN: 0 on success, error code on failure. 
* @@ -1669,7 +1669,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, { znode_t *zp = VTOZ(vp); iovec_t *iovp; - edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; @@ -1687,7 +1686,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, uint8_t type; int ncooks; cookie_t *cooks = NULL; - int flags = 0; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); @@ -1755,7 +1753,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } - eodp = (struct edirent *)odp; if (ncookies != NULL) { /* @@ -1824,25 +1821,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, type = ZFS_DIRENT_TYPE(zap.za_first_integer); } - if (flags & V_RDDIR_ACCFILTER) { - /* - * If we have no access at all, don't include - * this entry in the returned information - */ - znode_t *ezp; - if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) - goto skip_entry; - if (!zfs_has_access(ezp, cr)) { - vrele(ZTOV(ezp)); - goto skip_entry; - } - vrele(ZTOV(ezp)); - } - - if (flags & V_RDDIR_ENTFLAGS) - reclen = EDIRENT_RECLEN(strlen(zap.za_name)); - else - reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? @@ -1857,33 +1836,19 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, } break; } - if (flags & V_RDDIR_ENTFLAGS) { - /* - * Add extended flag entry: - */ - eodp->ed_ino = objnum; - eodp->ed_reclen = reclen; - /* NOTE: ed_off is the offset for the *next* entry */ - next = &(eodp->ed_off); - eodp->ed_eflags = zap.za_normalization_conflict ? 
- ED_CASE_CONFLICT : 0; - (void) strncpy(eodp->ed_name, zap.za_name, - EDIRENT_NAMELEN(reclen)); - eodp = (edirent_t *)((intptr_t)eodp + reclen); - } else { - /* - * Add normal entry: - */ - odp->d_ino = objnum; - odp->d_reclen = reclen; - odp->d_namlen = strlen(zap.za_name); - /* NOTE: d_off is the offset for the *next* entry. */ - next = &odp->d_off; - strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); - odp->d_type = type; - dirent_terminate(odp); - odp = (dirent64_t *)((intptr_t)odp + reclen); - } + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + odp->d_namlen = strlen(zap.za_name); + /* NOTE: d_off is the offset for the *next* entry. */ + next = &odp->d_off; + strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); + odp->d_type = type; + dirent_terminate(odp); + odp = (dirent64_t *)((intptr_t)odp + reclen); + outcount += reclen; ASSERT3S(outcount, <=, bufsize); @@ -1893,7 +1858,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); - skip_entry: /* * Move to the next entry, fill in the previous offset. */ From c50b3f14d33cd469af47e16f0c6c76f2b4b5158e Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Wed, 21 Sep 2022 03:19:05 +0500 Subject: [PATCH 62/69] Delay ZFS_PROP_SHARESMB property to handle it for encrypted raw receive For encrypted raw receive, objset creation is delayed until a call to dmu_recv_stream(). ZFS_PROP_SHARESMB property requires objset to be populated when calling zpl_earlier_version(). To correctly handle the ZFS_PROP_SHARESMB property for encrypted raw receive, this change delays setting the property. 
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Signed-off-by: Ameer Hamza Closes #13878 --- module/zfs/zfs_ioctl.c | 15 +++++++++++++++ .../functional/rsend/send_encrypted_props.ksh | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 6b9b43271ba..259d68c477d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4875,6 +4875,11 @@ extract_delay_props(nvlist_t *props) static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, + /* + * Setting ZFS_PROP_SHARESMB requires the objset type to be + * known, which is not possible prior to receipt of raw sends. + */ + ZFS_PROP_SHARESMB, 0 }; int i; @@ -4938,6 +4943,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; + nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; @@ -5052,6 +5058,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); + inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); @@ -5109,6 +5116,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } + if (inherited_delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, + inherited_delayprops, *errors); + } } /* @@ -5128,6 +5139,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } + if 
(inherited_delayprops != NULL) { + ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); + nvlist_free(inherited_delayprops); + } *read_bytes = off - noff; #ifdef ZFS_DEBUG diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh index 793904db91c..c0c7b682def 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh @@ -133,6 +133,14 @@ recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds +# Test that we can override sharesmb property for encrypted raw stream. +log_note "Must be able to override sharesmb property for encrypted raw stream" +ds=$TESTPOOL/recv +log_must eval "zfs send -w $esnap > $sendfile" +log_must eval "zfs recv -o sharesmb=on $ds < $sendfile" +log_must test "$(get_prop 'sharesmb' $ds)" == "on" +log_must zfs destroy -r $ds + # Test that we can override encryption properties on a properties stream # of an unencrypted dataset, turning it into an encryption root. log_note "Must be able to receive stream with props as encryption root" From de6c0d3d8cb279e0dd6a4831d85a9c45047908ba Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 20 Sep 2022 18:20:04 -0400 Subject: [PATCH 63/69] Fix potential NULL pointer dereference in zfsdle_vdev_online() Coverity complained about this. 
Reviewed-by: Brian Behlendorf Reviewed-by: Chunwei Chen Signed-off-by: Richard Yao Closes #13903 --- cmd/zed/agents/zfs_mod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index af6de73a1cc..53d9ababded 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -965,7 +965,7 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) nvlist_t *tgt; int error; - char *tmp_devname, devname[MAXPATHLEN]; + char *tmp_devname, devname[MAXPATHLEN] = ""; uint64_t guid; if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { From 7c6d94728c9f5deef7e7cc0bf4320385345de5bd Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 20 Sep 2022 18:20:56 -0400 Subject: [PATCH 64/69] Call va_end() before return in zpool_standard_error_fmt() Commit ecd6cf800b63704be73fb264c3f5b6e0dafc068d by marks in OpenSolaris at Tue Jun 26 07:44:24 2007 -0700 introduced a bug where we fail to call `va_end()` before returning. The man page for va_start() says: "Each invocation of va_start() must be matched by a corresponding invocation of va_end() in the same function." Coverity complained about this. Reviewed-by: Brian Behlendorf Reviewed-by: Chunwei Chen Signed-off-by: Richard Yao Closes #13904 --- lib/libzfs/libzfs_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 3067e8d4639..bc00a8dffd8 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -685,7 +685,7 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); - return (-1); + break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, From 3e5caef4c5b0cca3a892b92217955178ae8652bc Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 21 Sep 2022 00:21:30 +0200 Subject: [PATCH 65/69] FreeBSD: catch up to 1400068 Reviewed-by: Ryan Moeller Signed-off-by: Mateusz Guzik Closes #13909 --- module/os/freebsd/zfs/zfs_vnops_os.c | 41 ++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index e2222df123f..fae390a148d 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -970,13 +970,17 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, case RENAME: if (error == ENOENT) { error = EJUSTRETURN; +#if __FreeBSD_version < 1400068 cnp->cn_flags |= SAVENAME; +#endif break; } zfs_fallthrough; case DELETE: +#if __FreeBSD_version < 1400068 if (error == 0) cnp->cn_flags |= SAVENAME; +#endif break; } } @@ -1326,7 +1330,10 @@ zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, cnp->cn_nameptr = __DECONST(char *, name); cnp->cn_namelen = strlen(name); cnp->cn_nameiop = nameiop; - cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_flags = ISLASTCN; +#if __FreeBSD_version < 1400068 + cnp->cn_flags |= SAVENAME; +#endif cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cnp->cn_cred = kcred; #if __FreeBSD_version < 1400037 @@ -4590,7 +4597,9 @@ zfs_freebsd_create(struct vop_create_args *ap) znode_t *zp = NULL; int rc, mode; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; @@ -4620,7 +4629,9 @@ static int zfs_freebsd_remove(struct vop_remove_args *ap) { +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); @@ -4642,7 +4653,9 @@ 
zfs_freebsd_mkdir(struct vop_mkdir_args *ap) znode_t *zp = NULL; int rc; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); *ap->a_vpp = NULL; @@ -4668,7 +4681,9 @@ zfs_freebsd_rmdir(struct vop_rmdir_args *ap) { struct componentname *cnp = ap->a_cnp; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } @@ -4922,8 +4937,10 @@ zfs_freebsd_rename(struct vop_rename_args *ap) vnode_t *tvp = ap->a_tvp; int error; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); +#endif error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); @@ -4959,7 +4976,9 @@ zfs_freebsd_symlink(struct vop_symlink_args *ap) #endif int rc; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ vattr_init_mask(vap); @@ -5053,7 +5072,9 @@ zfs_freebsd_link(struct vop_link_args *ap) if (tdvp->v_mount != vp->v_mount) return (EXDEV); +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_link(VTOZ(tdvp), VTOZ(vp), cnp->cn_nameptr, cnp->cn_cred, 0)); @@ -5325,10 +5346,10 @@ zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname) NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); #endif error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); @@ -5470,12 +5491,10 @@ zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname) UIO_SYSSPACE, attrname, xvp); #endif error = namei(&nd); - vp = nd.ni_vp; - if (error != 0) { - NDFREE_PNBUF(&nd); + if (error != 0) return (SET_ERROR(error)); - } + vp = nd.ni_vp; error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE_PNBUF(&nd); @@ -5615,10 +5634,10 @@ zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname) #endif error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); VATTR_NULL(&va); va.va_size = 0; @@ -5802,10 +5821,10 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) UIO_SYSSPACE, ".", xvp); #endif error = namei(&nd); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; From fbf874a4acd86a118a695fb695fe934e68fc6b6f Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 21 Sep 2022 00:22:32 +0200 Subject: [PATCH 66/69] FreeBSD: handle V_PCATCH See https://cgit.FreeBSD.org/src/commit/?id=a75d1ddd74312f5dd79bc1e965f7077679659f2e Reviewed-by: Ryan Moeller 
Reviewed-by: Alexander Motin Signed-off-by: Mateusz Guzik Closes #13910 --- module/os/freebsd/zfs/zfs_file_os.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c index fd86a75416e..60c9ff0581e 100644 --- a/module/os/freebsd/zfs/zfs_file_os.c +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -226,7 +226,11 @@ zfs_vop_fsync(vnode_t *vp) struct mount *mp; int error; +#if __FreeBSD_version < 1400068 if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) +#else + if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) +#endif goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, curthread); From 62e2a2881f6b441c136fb4ccb66ab491a5e6101f Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Tue, 20 Sep 2022 18:25:21 -0400 Subject: [PATCH 67/69] Fix minor issues in namespace delegation support get_user_ns() is only done once for each namespace, so put_user_ns() should be done once too. Fix two typos in user_namespace/user_namespace_002.ksh and user_namespace/user_namespace_003.ksh. 
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Youzhong Yang Closes #13918 --- module/os/linux/spl/spl-zone.c | 2 +- .../tests/functional/user_namespace/user_namespace_002.ksh | 2 +- .../tests/functional/user_namespace/user_namespace_003.ksh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/module/os/linux/spl/spl-zone.c b/module/os/linux/spl/spl-zone.c index b8a8b7cd8cd..234ae7f6cd0 100644 --- a/module/os/linux/spl/spl-zone.c +++ b/module/os/linux/spl/spl-zone.c @@ -415,8 +415,8 @@ spl_zone_fini(void) zone_dataset_t, zd_list); list_del(&zd->zd_list); kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); - put_user_ns(zds->zds_userns); } + put_user_ns(zds->zds_userns); list_del(&zds->zds_list); kmem_free(zds, sizeof (*zds)); } diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_002.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_002.ksh index b04898fa81a..cfc478cd359 100755 --- a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_002.ksh +++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_002.ksh @@ -85,7 +85,7 @@ fi list="$($NSENTER zfs list -r -H -o name | tr '\n' ' ')" log_must test -z "$list" log_must zfs zone $proc_ns $TESTPOOL/userns -proc_ns_added="$ns" +proc_ns_added="$proc_ns" # 2. 
'zfs list' list="$($NSENTER zfs list -r -H -o name $TESTPOOL | tr '\n' ' ')" diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_003.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_003.ksh index 2a875d09b6a..6a746c6d33f 100755 --- a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_003.ksh +++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_003.ksh @@ -88,7 +88,7 @@ list="$($NSENTER zfs list -r -H -o name | tr '\n' ' ')" log_must test -z "$list" log_must zfs zone $proc_ns $TESTPOOL/userns log_must zfs zone $proc_ns $TESTPOOL/otheruserns -proc_ns_added="$ns" +proc_ns_added="$proc_ns" # 2. 'zfs list' list="$($NSENTER zfs list -r -H -o name $TESTPOOL | tr '\n' ' ')" From 402426c7d81f410fa088c3bd893d4941a97d8332 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 21 Sep 2022 00:32:44 +0200 Subject: [PATCH 68/69] Add membar_sync Provides the missing full barrier variant to the membar primitive set. While not used right now, this is probably going to change down the road. Name taken from Solaris, to follow the existing routines. 
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Mateusz Guzik Closes #13907 --- include/os/freebsd/spl/sys/atomic.h | 1 + include/os/linux/spl/sys/vmsystm.h | 1 + lib/libspl/atomic.c | 6 ++++++ lib/libspl/include/atomic.h | 7 +++++++ 4 files changed, 15 insertions(+) diff --git a/include/os/freebsd/spl/sys/atomic.h b/include/os/freebsd/spl/sys/atomic.h index 01b13fc9afd..8b9cec15c5e 100644 --- a/include/os/freebsd/spl/sys/atomic.h +++ b/include/os/freebsd/spl/sys/atomic.h @@ -59,6 +59,7 @@ extern uint64_t atomic_cas_64(volatile uint64_t *target, uint64_t cmp, #define membar_consumer() atomic_thread_fence_acq() #define membar_producer() atomic_thread_fence_rel() +#define membar_sync() atomic_thread_fence_seq_cst() static __inline uint32_t atomic_add_32_nv(volatile uint32_t *target, int32_t delta) diff --git a/include/os/linux/spl/sys/vmsystm.h b/include/os/linux/spl/sys/vmsystm.h index fcd61e818fa..c6d99fb3183 100644 --- a/include/os/linux/spl/sys/vmsystm.h +++ b/include/os/linux/spl/sys/vmsystm.h @@ -46,6 +46,7 @@ #define membar_consumer() smp_rmb() #define membar_producer() smp_wmb() +#define membar_sync() smp_mb() #define physmem zfs_totalram_pages diff --git a/lib/libspl/atomic.c b/lib/libspl/atomic.c index ba14b113f58..8cc350710ba 100644 --- a/lib/libspl/atomic.c +++ b/lib/libspl/atomic.c @@ -381,6 +381,12 @@ membar_exit(void) __atomic_thread_fence(__ATOMIC_SEQ_CST); } +void +membar_sync(void) +{ + __atomic_thread_fence(__ATOMIC_SEQ_CST); +} + void membar_producer(void) { diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h index 1249d42b604..4ebdbbda986 100644 --- a/lib/libspl/include/atomic.h +++ b/lib/libspl/include/atomic.h @@ -313,6 +313,13 @@ extern void membar_enter(void); */ extern void membar_exit(void); +/* + * Make all stores and loads emitted prior to the barrier complete before + * crossing it, while also making sure stores and loads emitted after the + * barrier only start being executed after crossing
it. + */ +extern void membar_sync(void); + /* * Arrange that all stores issued before this point in the code reach * global visibility before any stores that follow; useful in producer From c629f0bf62e351355716f9870d6c2e377584b016 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 21 Sep 2022 00:34:41 +0200 Subject: [PATCH 69/69] Retire ZFS_TEARDOWN_TRY_ENTER_READ There were never any users and it so happens the operation is not even supported by rrm locks -- the macros were wrong for Linux and FreeBSD when not using its RMS locks. Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Mateusz Guzik Closes #13906 --- include/os/freebsd/zfs/sys/zfs_vfsops_os.h | 6 ------ include/os/linux/zfs/sys/zfs_vfsops_os.h | 3 --- 2 files changed, 9 deletions(-) diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index c44f7c6f06b..f765d38dbac 100644 --- a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -128,9 +128,6 @@ struct zfsvfs { #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rms_destroy(&(zfsvfs)->z_teardown_lock) -#define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \ - rms_try_rlock(&(zfsvfs)->z_teardown_lock) - #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rms_rlock(&(zfsvfs)->z_teardown_lock); @@ -161,9 +158,6 @@ struct zfsvfs { #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rrm_destroy(&(zfsvfs)->z_teardown_lock) -#define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \ - rw_tryenter(&(zfsvfs)->z_teardown_lock, RW_READER) - #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag); diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index 697ae2018ec..e320b8de422 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -143,9 +143,6 @@ struct zfsvfs { #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rrm_destroy(&(zfsvfs)->z_teardown_lock) -#define
ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \ - rw_tryenter(&(zfsvfs)->z_teardown_lock, RW_READER) - #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag);