From d65015938e195370930c501d26f9c73ca6a4c4fe Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 7 May 2026 12:16:39 -0400 Subject: [PATCH] Vdev allocation bias/class change Normal, special and dedup vdevs differ only in their space allocation bias. Normal and special vdevs may even legally store blocks targeted at other classes. Dedup vdevs don't normally do so, but there is no real reason why they can't. Considering this, it is possible to change the allocation bias of those vdevs. This change introduces a new top-level vdev property, alloc_bias, which reports the current bias of the vdev and allows it to be changed. This makes it easy to change a vdev's role in a pool, especially if vdev removal is impossible. To avoid complicating the code, changes take effect only on the next pool import. Changes to/from a log vdev are also theoretically possible, but they are artificially blocked for now, partly due to additional complications, and partly due to the potential danger of placing other blocks on log vdevs, whose failure would otherwise be non-fatal. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alek Pinchuk Signed-off-by: Alexander Motin Closes #18493 --- include/sys/fs/zfs.h | 11 ++ include/sys/vdev_impl.h | 8 -- lib/libzfs/libzfs.abi | 8 +- lib/libzfs/libzfs_pool.c | 3 + man/man7/vdevprops.7 | 15 +++ module/zcommon/zpool_prop.c | 12 ++ module/zfs/vdev.c | 77 +++++++++++++ module/zfs/zfs_ioctl.c | 6 + tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 2 + .../alloc_class/alloc_class_014_pos.ksh | 109 ++++++++++++++++++ .../alloc_class/alloc_class_015_neg.ksh | 91 +++++++++++++++ 12 files changed, 334 insertions(+), 11 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index fcef32ecf9f..d9b6e7654b0 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -476,6 +476,7 @@ typedef enum { VDEV_PROP_SCHEDULER, VDEV_PROP_FDOMAIN, VDEV_PROP_FGROUP, + VDEV_PROP_ALLOC_BIAS, VDEV_NUM_PROPS } vdev_prop_t; @@ -491,6 +492,16 @@ typedef enum { VDEV_SCHEDULER_OFF } vdev_scheduler_type_t; +/* + * Allocation bias for top-level vdevs (alloc_bias property). + */ +typedef enum vdev_alloc_bias { + VDEV_BIAS_NONE, + VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ + VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ + VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ +} vdev_alloc_bias_t; + /* * Dataset property functions shared between libzfs and kernel. 
*/ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 634594aca12..3c19b9abe9c 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -155,14 +155,6 @@ struct vdev_queue { kmutex_t vq_lock; }; -typedef enum vdev_alloc_bias { - VDEV_BIAS_NONE, - VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ - VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ - VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ -} vdev_alloc_bias_t; - - /* * On-disk indirect vdev state. * diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index ad28c876630..be74babbcba 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -2553,7 +2553,7 @@ - + @@ -2605,6 +2605,9 @@ + + + @@ -6412,7 +6415,8 @@ - + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 0b015d8bce6..fd957d98313 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5741,6 +5741,9 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, return (ENOENT); if (prop == VDEV_PROP_SIT_OUT) return (ENOENT); + /* Only valid for top-level vdevs */ + if (prop == VDEV_PROP_ALLOC_BIAS) + return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index b45128dd924..5f5e10723c1 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -218,6 +218,21 @@ If this device should perform new allocations, used to disable a device when it is scheduled for later removal. See .Xr zpool-remove 8 . +.It Sy alloc_bias Ns = Ns Sy none Ns | Ns Sy log Ns | Ns Sy special Ns | Ns Sy dedup +Controls the allocation class for a top-level vdev. +Changes take effect after an export and import of the pool. +Changing to/from log is not implemented, since it may lead to data loss in +case of the log device failure. +Setting to +.Sy special +and +.Sy dedup +requires +.Sy feature@allocation_classes +to be enabled. 
+At least one top-level vdev must remain in the normal +.Pq Sy none +class. .It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy on Ns | Ns Sy off Controls how I/O requests are added to the vdev queue when reading or writing to this vdev. diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index ee86fe0c717..13a1390d1e1 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -388,6 +388,14 @@ vdev_prop_init(void) { NULL } }; + static const zprop_index_t vdev_alloc_bias_table[] = { + { "none", VDEV_BIAS_NONE }, + { "log", VDEV_BIAS_LOG }, + { "special", VDEV_BIAS_SPECIAL }, + { "dedup", VDEV_BIAS_DEDUP }, + { NULL } + }; + struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES); @@ -556,6 +564,10 @@ vdev_prop_init(void) VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV, "auto | on | off", "IO_SCHEDULER", vdevschedulertype_table, sfeatures); + zprop_register_index(VDEV_PROP_ALLOC_BIAS, "alloc_bias", + VDEV_BIAS_NONE, PROP_DEFAULT, ZFS_TYPE_VDEV, + "none | log | special | dedup", "ALLOC_BIAS", + vdev_alloc_bias_table, sfeatures); /* hidden properties */ zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING, diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 4cc75ad5a25..9f083cd510f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -6093,6 +6093,29 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) strval); } break; + case VDEV_PROP_ALLOC_BIAS: { + intval = fnvpair_value_uint64(elem); + ASSERT3U(intval, !=, VDEV_BIAS_LOG); + const char *bias_str = + (intval == VDEV_BIAS_SPECIAL) ? + VDEV_ALLOC_BIAS_SPECIAL : + (intval == VDEV_BIAS_DEDUP) ? 
+ VDEV_ALLOC_BIAS_DEDUP : NULL; + if (bias_str == NULL) { + (void) zap_remove(mos, objid, + VDEV_TOP_ZAP_ALLOCATION_BIAS, tx); + } else { + VERIFY0(zap_update(mos, objid, + VDEV_TOP_ZAP_ALLOCATION_BIAS, + 1, strlen(bias_str) + 1, bias_str, tx)); + spa_activate_allocation_classes(spa, tx); + } + spa_history_log_internal(spa, "vdev set", tx, + "vdev_guid=%llu: alloc_bias=%s", + (u_longlong_t)vdev_guid, + bias_str != NULL ? bias_str : "none"); + break; + } default: /* normalize the property name */ propname = vdev_prop_to_name(prop); @@ -6319,6 +6342,53 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_scheduler = intval; break; + case VDEV_PROP_ALLOC_BIAS: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (vd != vd->vdev_top || vd->vdev_top_zap == 0) { + error = ENOTSUP; + break; + } + /* Log vdevs are not supported: remove and re-add. */ + if (vd->vdev_islog) { + error = ENOTSUP; + break; + } + /* special/dedup needs allocation_classes feature */ + if (intval != VDEV_BIAS_NONE && + ((intval != VDEV_BIAS_SPECIAL && + intval != VDEV_BIAS_DEDUP) || + !spa_feature_is_enabled(spa, + SPA_FEATURE_ALLOCATION_CLASSES))) { + error = ENOTSUP; + break; + } + /* + * Disallow converting the last normal vdev to + * avoid pool suspension on failed allocations. 
+ */ + if (intval != VDEV_BIAS_NONE && + vd->vdev_alloc_bias == VDEV_BIAS_NONE) { + vdev_t *rvd = spa->spa_root_vdev; + int normal = 0; + for (uint64_t c = 0; + c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + if (vdev_is_concrete(cvd) && + cvd->vdev_alloc_bias == + VDEV_BIAS_NONE && + !cvd->vdev_noalloc) + normal++; + } + if (normal <= 1) { + error = ENOTSUP; + break; + } + } + vd->vdev_alloc_bias = (vdev_alloc_bias_t)intval; + break; default: /* Most processing is done in vdev_props_set_sync */ break; @@ -6746,6 +6816,13 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, boolval, src); break; + case VDEV_PROP_ALLOC_BIAS: + if (vd == vd->vdev_top) { + vdev_prop_add_list(outnvl, propname, + NULL, vd->vdev_alloc_bias, + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index fe98e7db073..d31aa80641c 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3456,12 +3456,15 @@ zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) ASSERT(spa_writeable(spa)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_set(vd, innvl, outnvl); + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); @@ -3500,12 +3503,15 @@ zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_get(vd, innvl, outnvl); + spa_config_exit(spa, SCL_CONFIG, FTAG); 
spa_close(spa, FTAG); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 4c7e4e85ec0..df80437ad0c 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -37,7 +37,8 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', - 'alloc_class_013_pos', 'alloc_class_016_pos'] + 'alloc_class_013_pos', 'alloc_class_014_pos', 'alloc_class_015_neg', + 'alloc_class_016_pos'] tags = ['functional', 'alloc_class'] [tests/functional/append] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index c4bcfea5595..a6242ba0f52 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -434,6 +434,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ + functional/alloc_class/alloc_class_014_pos.ksh \ + functional/alloc_class/alloc_class_015_neg.ksh \ functional/alloc_class/alloc_class_016_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh new file mode 100755 index 00000000000..27c55bc5906 --- /dev/null +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh @@ -0,0 +1,109 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2026, TrueNAS. +# + +. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib + +# +# DESCRIPTION: +# The alloc_bias vdev property is readable and settable on top-level vdevs. +# +# STRATEGY: +# 1. Create a pool with one normal mirror and one special mirror. +# 2. Verify alloc_bias getter returns "none" for normal and "special" +# for the special mirror. +# 3. Verify alloc_bias is not reported for leaf (child) vdevs. +# 4. Set alloc_bias=none on the special vdev; verify getter returns "none". +# 5. Export and import the pool; verify no "special" section in status. +# 6. Set alloc_bias=dedup on the same vdev; verify getter returns "dedup". +# 7. Export and import the pool; verify "dedup" section appears in status. +# 8. Set alloc_bias=special; verify getter returns "special". +# 9. Export and import; verify "special" section appears again. +# + +verify_runnable "global" + +claim="alloc_bias vdev property is readable and settable on top-level vdevs" + +log_assert $claim +log_onexit cleanup + +log_must disk_setup + +# One normal mirror (always stays normal) and one special mirror. +# The normal mirror ensures the pool always has normal-class vdevs +# regardless of what we do to the second mirror. +log_must zpool create $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ + special mirror $CLASS_DISK0 $CLASS_DISK1 + +# Find the special vdev name (mirror-N) from zpool status. +TVDEV=$(zpool status $TESTPOOL | \ + awk '/special/{found=1} found && /mirror-/{print $1; exit}') +log_note "Special vdev: $TVDEV" +[[ -n "$TVDEV" ]] || log_fail "Could not determine special vdev name" + +# Verify initial alloc_bias values. 
+BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL mirror-0) +[[ "$BIAS" == "none" ]] || \ + log_fail "Normal mirror alloc_bias: expected none, got $BIAS" + +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV) +[[ "$BIAS" == "special" ]] || \ + log_fail "Special mirror alloc_bias: expected special, got $BIAS" + +# Verify alloc_bias is not reported for a leaf vdev. +LEAF_OUT=$(zpool get -H -o name,value alloc_bias $TESTPOOL \ + $ZPOOL_DISK0 2>&1) +[[ -z "$LEAF_OUT" ]] || \ + log_fail "alloc_bias reported for leaf vdev, got: $LEAF_OUT" + +# --- special -> none, verify after export/import --- +log_must zpool set alloc_bias=none $TESTPOOL $TVDEV +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV) +[[ "$BIAS" == "none" ]] || \ + log_fail "After set none: alloc_bias expected none, got $BIAS" + +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL +zpool status $TESTPOOL | grep -q "special" && \ + log_fail "special still shown after alloc_bias=none + reimport" + +# --- none -> dedup, verify after export/import --- +log_must zpool set alloc_bias=dedup $TESTPOOL $TVDEV +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV) +[[ "$BIAS" == "dedup" ]] || \ + log_fail "After set dedup alloc_bias expected dedup, got $BIAS" + +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL +zpool status $TESTPOOL | grep -q "dedup" || \ + log_fail "dedup not shown after alloc_bias=dedup + reimport" + +# --- dedup -> special, verify after export/import --- +log_must zpool set alloc_bias=special $TESTPOOL $TVDEV +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV) +[[ "$BIAS" == "special" ]] || \ + log_fail "After set special alloc_bias expected special, got $BIAS" + +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL +zpool status $TESTPOOL | grep -q "special" || \ + log_fail "special not shown after alloc_bias=special + reimport" + +log_must zpool destroy -f $TESTPOOL 
+log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh new file mode 100755 index 00000000000..43740690b3c --- /dev/null +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2026, TrueNAS. +# + +. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib + +# +# DESCRIPTION: +# Setting the alloc_bias vdev property to invalid values or on +# unsupported vdev types fails. +# +# STRATEGY: +# 1. Create a pool with a normal mirror and a log vdev. +# 2. Verify setting alloc_bias on a leaf vdev fails. +# 3. Verify setting alloc_bias=log fails. +# 4. Verify setting alloc_bias to an unknown value fails. +# 5. Verify setting alloc_bias on a log vdev fails. +# 6. Verify setting alloc_bias=special fails when allocation_classes +# feature is not enabled. +# 7. Verify converting the last normal vdev fails. +# + +verify_runnable "global" + +claim="Setting alloc_bias to invalid values or on unsupported vdevs fails" + +log_assert $claim +log_onexit cleanup + +log_must disk_setup + +# Create a pool with a normal mirror and a log vdev. +log_must zpool create $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ + log $CLASS_DISK0 + +NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}') +log_note "Normal vdev: $NORMAL_VDEV" + +# Setting alloc_bias on a leaf vdev must fail. 
+log_mustnot zpool set alloc_bias=special $TESTPOOL $ZPOOL_DISK0 + +# Setting alloc_bias=log must fail (log vdevs must be removed and re-added). +log_mustnot zpool set alloc_bias=log $TESTPOOL $NORMAL_VDEV + +# Setting alloc_bias to an unknown value must fail. +log_mustnot zpool set alloc_bias=bogus $TESTPOOL $NORMAL_VDEV + +# Setting alloc_bias on a log vdev must fail. +# CLASS_DISK0 is a single-disk (non-mirror) top-level log vdev. +log_mustnot zpool set alloc_bias=special $TESTPOOL $CLASS_DISK0 + +log_must zpool destroy -f $TESTPOOL + +# Verify setting alloc_bias=special fails when allocation_classes is disabled. +# Create a pool with the allocation_classes feature explicitly disabled. +log_must zpool create -o feature@allocation_classes=disabled $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 + +NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}') +log_mustnot zpool set alloc_bias=special $TESTPOOL $NORMAL_VDEV +log_mustnot zpool set alloc_bias=dedup $TESTPOOL $NORMAL_VDEV + +log_must zpool destroy -f $TESTPOOL + +# Verify that converting the last normal-class top-level vdev fails. +# A pool must always retain at least one normal vdev. +log_must zpool create $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ + special mirror $CLASS_DISK0 $CLASS_DISK1 + +NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}') +log_mustnot zpool set alloc_bias=special $TESTPOOL $NORMAL_VDEV +log_mustnot zpool set alloc_bias=dedup $TESTPOOL $NORMAL_VDEV + +log_must zpool destroy -f $TESTPOOL +log_pass $claim