zed: Prefer spares with matching rotational and size

Before this change zed tried to activate spares just in order they
are stored in configuration, which is quite arbitrary.  To make
the result more optimal, sort the spares by their rotational status
and size, so that the most fitting ones have better chances.

To make it more visible, export the rotational status as a vdev
property.  While at it, minimally fix vdev properties reading for
spare and L2ARC vdevs, having no ZAPs.

To keep the rotational status for spare activation purposes when
failed device is already gone, save it into the vdev config.  The
same is for spare vdevs asize.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18597
This commit is contained in:
Alexander Motin
2026-05-28 18:14:26 -04:00
committed by GitHub
parent 3250b4393e
commit 472ddca116
14 changed files with 328 additions and 18 deletions
+78 -4
View File
@@ -350,6 +350,47 @@ is_draid_fdomain_failure(fmd_hdl_t *hdl, libzfs_handle_t *zhdl,
return (res);
}
/*
* Returns B_TRUE if spare 'a' should be tried before spare 'b' when
* replacing a failed vdev with the given characteristics.
*
* Ordering criteria (most to least significant):
* 1. Matching rotational is preferred over mismatching.
* 2. Large enough is preferred over (potentially?) too small.
* 3. Smaller size is preferred over bigger (best fit).
*/
static boolean_t
spare_is_preferred(nvlist_t *a, nvlist_t *b, boolean_t have_rotational,
uint64_t vdev_rotational, uint64_t vdev_size)
{
uint64_t a_rotational = 0, b_rotational = 0;
uint64_t a_size = 0, b_size = 0;
if (have_rotational) {
(void) nvlist_lookup_uint64(a, ZPOOL_CONFIG_VDEV_ROTATIONAL,
&a_rotational);
(void) nvlist_lookup_uint64(b, ZPOOL_CONFIG_VDEV_ROTATIONAL,
&b_rotational);
if ((a_rotational == vdev_rotational) !=
(b_rotational == vdev_rotational))
return (a_rotational == vdev_rotational);
}
vdev_stat_t *vs;
unsigned int c;
if (nvlist_lookup_uint64_array(a, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0)
a_size = vs->vs_rsize;
if (nvlist_lookup_uint64_array(b, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0)
b_size = vs->vs_rsize;
boolean_t a_ok = (a_size >= vdev_size);
boolean_t b_ok = (b_size >= vdev_size);
if (a_ok != b_ok)
return (a_ok);
return (a_size < b_size);
}
/*
* Given a vdev, attempt to replace it with every known spare until one
* succeeds or we run out of devices to try.
@@ -364,6 +405,10 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
char *dev_name;
zprop_source_t source;
int ashift;
uint64_t vdev_rotational = 0, vdev_size = 0;
boolean_t have_vdev_rotational;
vdev_stat_t *vs;
unsigned int c;
config = zpool_get_config(zhp, NULL);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
@@ -377,6 +422,34 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
&spares, &nspares) != 0)
return (B_FALSE);
/*
* Collect the failed vdev's parameters for optimal replacement.
*/
have_vdev_rotational = (nvlist_lookup_uint64(vdev,
ZPOOL_CONFIG_VDEV_ROTATIONAL, &vdev_rotational) == 0);
if (nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0)
vdev_size = vs->vs_rsize;
/*
* Build a sorted index array over the spares, so that better
* candicates are tried first.
*/
uint_t order[nspares];
for (s = 0; s < nspares; s++)
order[s] = s;
for (s = 1; s < nspares; s++) {
uint_t key = order[s];
int j = (int)s - 1;
while (j >= 0 && spare_is_preferred(spares[key],
spares[order[j]], have_vdev_rotational, vdev_rotational,
vdev_size)) {
order[j + 1] = order[j];
j--;
}
order[j + 1] = key;
}
/*
* lookup "ashift" pool property, we may need it for the replacement
*/
@@ -394,25 +467,26 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
* replace it.
*/
for (s = 0; s < nspares; s++) {
nvlist_t *spare = spares[order[s]];
boolean_t rebuild = B_FALSE;
const char *spare_name, *type;
if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
if (nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH,
&spare_name) != 0)
continue;
/* prefer sequential resilvering for distributed spares */
if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
if ((nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE,
&type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
rebuild = B_TRUE;
/* if set, add the "ashift" pool property to the spare nvlist */
if (source != ZPROP_SRC_DEFAULT)
(void) nvlist_add_uint64(spares[s],
(void) nvlist_add_uint64(spare,
ZPOOL_CONFIG_ASHIFT, ashift);
(void) nvlist_add_nvlist_array(replacement,
ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spares[s], 1);
ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spare, 1);
fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
dev_name, zfs_basename(spare_name));
+2
View File
@@ -478,6 +478,7 @@ typedef enum {
VDEV_PROP_FDOMAIN,
VDEV_PROP_FGROUP,
VDEV_PROP_ALLOC_BIAS,
VDEV_PROP_ROTATIONAL,
VDEV_NUM_PROPS
} vdev_prop_t;
@@ -931,6 +932,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_VDEV_ROTATIONAL "rotational"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
#define ZPOOL_CONFIG_SPARES "spares"
+2 -1
View File
@@ -6416,7 +6416,8 @@
<enumerator name='VDEV_PROP_FDOMAIN' value='56'/>
<enumerator name='VDEV_PROP_FGROUP' value='57'/>
<enumerator name='VDEV_PROP_ALLOC_BIAS' value='58'/>
<enumerator name='VDEV_NUM_PROPS' value='59'/>
<enumerator name='VDEV_PROP_ROTATIONAL' value='59'/>
<enumerator name='VDEV_NUM_PROPS' value='60'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
+2
View File
@@ -142,6 +142,8 @@ See
.Xr zpool-attach 8 .
.It Sy trim_support
Indicates if a leaf device supports trim operations.
.It Sy rotational
Indicates whether the device backing this vdev uses rotating media.
.El
.Pp
The following native properties can be used to change the behavior of a vdev.
+3
View File
@@ -574,6 +574,9 @@ vdev_prop_init(void)
VDEV_BIAS_NONE, PROP_DEFAULT, ZFS_TYPE_VDEV,
"none | log | special | dedup", "ALLOC_BIAS",
vdev_alloc_bias_table, sfeatures);
zprop_register_index(VDEV_PROP_ROTATIONAL, "rotational", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "ROTATIONAL",
boolean_table, sfeatures);
/* hidden properties */
zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING,
+12 -4
View File
@@ -8333,12 +8333,20 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
return (spa_vdev_exit(spa, newrootvd, txg, error));
/*
* log, dedup and special vdevs should not be replaced by spares.
* Spares can't replace logs
*/
if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
* For special and dedup vdevs a spare must have matching rotational
* characteristics. A rotating spare replacing a non-rotating vdev
* would silently degrade pool performance, so we reject the mismatch.
*/
if (newvd->vdev_isspare &&
oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE &&
newvd->vdev_nonrot != oldvd->vdev_nonrot)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
/*
* A dRAID spare can only replace a child of its parent dRAID vdev.
+41 -7
View File
@@ -474,8 +474,11 @@ vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
uint64_t objid;
int err;
if (vdev_prop_get_objid(vd, &objid) != 0)
return (EINVAL);
if (vdev_prop_get_objid(vd, &objid) != 0) {
/* No ZAP: property was never set, return the default. */
*value = vdev_prop_default_numeric(prop);
return (ENOENT);
}
err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
sizeof (uint64_t), 1, value);
@@ -963,6 +966,20 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_wholedisk) != 0)
vd->vdev_wholedisk = -1ULL;
/*
* Restore the last-known rotational status for leaf vdevs. vdev_open()
* will overwrite this with the hardware value when the device is
* accessible; the persisted value acts as a fallback for failed or
* missing devices so that spare selection can still match on device
* type even when the original disk is gone.
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t rotational = 0;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL,
&rotational) == 0)
vd->vdev_nonrot = !rotational;
}
vic = &vd->vdev_indirect_config;
ASSERT0(vic->vic_mapping_object);
@@ -6446,9 +6463,15 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
if (vdev_prop_get_objid(vd, &objid) != 0)
return (SET_ERROR(EINVAL));
ASSERT(objid != 0);
/*
* A missing ZAP is normal for spare and L2ARC vdevs, which are
* not part of the main vdev tree and never get ZAPs allocated.
* Many properties are sourced directly from vdev_t fields and
* work fine without one; ZAP-backed properties will return their
* default values. objid is set to 0 when absent and the few
* cases that call zap_lookup directly guard against this below.
*/
(void) vdev_prop_get_objid(vd, &objid);
mutex_enter(&spa->spa_props_lock);
@@ -6772,8 +6795,13 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_FAILFAST:
src = ZPROP_SRC_LOCAL;
err = zap_lookup(mos, objid, nvpair_name(elem),
sizeof (uint64_t), 1, &intval);
if (objid != 0) {
err = zap_lookup(mos, objid,
nvpair_name(elem),
sizeof (uint64_t), 1, &intval);
} else {
err = ENOENT;
}
if (err == ENOENT) {
if (vd->vdev_ops == &vdev_root_ops)
intval =
@@ -6835,6 +6863,10 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
ZPROP_SRC_NONE);
}
continue;
case VDEV_PROP_ROTATIONAL:
vdev_prop_add_list(outnvl, propname, NULL,
!vd->vdev_nonrot, ZPROP_SRC_NONE);
continue;
case VDEV_PROP_CHECKSUM_N:
case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N:
@@ -6860,6 +6892,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
/* FALLTHRU */
case VDEV_PROP_USERPROP:
/* User Properites */
if (objid == 0)
continue;
src = ZPROP_SRC_LOCAL;
err = zap_length(mos, objid, nvpair_name(elem),
+8
View File
@@ -493,6 +493,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_wholedisk);
}
if (vd->vdev_ops->vdev_op_leaf) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL,
!vd->vdev_nonrot);
}
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
@@ -502,6 +507,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (flags & VDEV_CONFIG_L2CACHE)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
if ((flags & VDEV_CONFIG_SPARE) && vd->vdev_asize != 0)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd == vd->vdev_top) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+1 -1
View File
@@ -1111,7 +1111,7 @@ tags = ['functional', 'vdev_disk']
[tests/functional/vdev_zaps]
tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos',
'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos',
'vdev_zaps_007_pos']
'vdev_zaps_007_pos', 'vdev_zaps_008_pos']
tags = ['functional', 'vdev_zaps']
[tests/functional/write_dirs]
+2 -1
View File
@@ -118,7 +118,8 @@ tags = ['functional', 'fallocate']
tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
'auto_spare_002_pos', 'auto_spare_double', 'auto_spare_multiple',
'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault',
'auto_spare_ashift', 'auto_spare_rotational', 'auto_spare_shared',
'decrypt_fault',
'decompress_fault', 'fault_limits', 'scrub_after_resilver',
'suspend_on_probe_errors', 'suspend_resume_single', 'suspend_draid_fgroups',
'zpool_status_-s']
+2
View File
@@ -1620,6 +1620,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/fault/auto_spare_001_pos.ksh \
functional/fault/auto_spare_002_pos.ksh \
functional/fault/auto_spare_ashift.ksh \
functional/fault/auto_spare_rotational.ksh \
functional/fault/auto_spare_double.ksh \
functional/fault/auto_spare_multiple.ksh \
functional/fault/auto_spare_shared.ksh \
@@ -2292,6 +2293,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/vdev_zaps/vdev_zaps_005_pos.ksh \
functional/vdev_zaps/vdev_zaps_006_pos.ksh \
functional/vdev_zaps/vdev_zaps_007_pos.ksh \
functional/vdev_zaps/vdev_zaps_008_pos.ksh \
functional/write_dirs/cleanup.ksh \
functional/write_dirs/setup.ksh \
functional/write_dirs/write_dirs_001_pos.ksh \
@@ -66,6 +66,7 @@ typeset -a properties=(
trim_bytes
removing
allocating
rotational
failfast
checksum_n
checksum_t
@@ -0,0 +1,84 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2026, TrueNAS.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg
#
# DESCRIPTION:
# ZED prefers the smallest sufficient spare when replacing a faulted
# special vdev, regardless of spare list order.
#
# The 'rotational' property is persisted in the pool config for all leaf
# vdevs so that spare selection can match device type even after the
# original disk is gone. ZED sorts spares preferring matching rotational
# and, among equally-matching spares, the smallest sufficient one.
#
# STRATEGY:
# 1. Create a pool with a normal mirror, a special mirror, and two file
# spares of different sizes. List the larger spare first so that the
# sorted order contradicts the list order.
# 2. Fault a member of the special mirror; verify ZED activates the
# smaller sufficient spare, leaving the larger spare available.
#
verify_runnable "both"
NORM1="$TEST_BASE_DIR/rotational-norm1"
NORM2="$TEST_BASE_DIR/rotational-norm2"
SPEC1="$TEST_BASE_DIR/rotational-spec1"
SPEC2="$TEST_BASE_DIR/rotational-spec2"
SPARE_SMALL="$TEST_BASE_DIR/rotational-spare-small"
SPARE_LARGE="$TEST_BASE_DIR/rotational-spare-large"
LARGE_SIZE=$((MINVDEVSIZE * 2))
function cleanup
{
log_must zinject -c all
destroy_pool $TESTPOOL
rm -f $NORM1 $NORM2 $SPEC1 $SPEC2 $SPARE_SMALL $SPARE_LARGE
}
log_assert "ZED selects smallest sufficient spare for a faulted special vdev"
log_onexit cleanup
zed_events_drain
log_must truncate -s $MINVDEVSIZE $NORM1 $NORM2 $SPEC1 $SPEC2 $SPARE_SMALL
log_must truncate -s $LARGE_SIZE $SPARE_LARGE
# SPARE_LARGE is listed first so that size-preference sorting is what
# causes SPARE_SMALL to be selected, not merely list order.
log_must zpool create -f $TESTPOOL \
mirror $NORM1 $NORM2 \
special mirror $SPEC1 $SPEC2 \
spare $SPARE_LARGE $SPARE_SMALL
log_must zinject -d $SPEC1 -e io -T all -f 100 $TESTPOOL
log_must zpool scrub $TESTPOOL
log_note "Wait for ZED to auto-spare the special vdev"
log_must wait_vdev_state $TESTPOOL $SPEC1 "FAULTED" 60
log_must wait_hotspare_state $TESTPOOL $SPARE_SMALL "INUSE"
# The larger spare must not have been activated.
log_must wait_hotspare_state $TESTPOOL $SPARE_LARGE "AVAIL"
log_must check_state $TESTPOOL "" "DEGRADED"
log_pass "ZED activated the smallest sufficient spare for the special vdev"
@@ -0,0 +1,90 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2026, TrueNAS.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Verify that the 'rotational' vdev property is readable on spare and
# L2ARC vdevs, which have no per-vdev ZAP, and that its value persists
# across export/import when the spare device is absent.
#
# STRATEGY:
# 1. Create a pool with a mirror, a spare, and an L2ARC device.
# 2. Verify 'rotational' is readable on leaf, virtual (mirror), spare,
# and L2ARC vdevs.
# 3. Export the pool, remove the spare file, re-import, and verify that
# 'rotational' still reports the same value for the missing spare,
# proving the value comes from the persisted config.
#
verify_runnable "global"
SPARE="$TEST_BASE_DIR/vz008-spare"
L2C="$TEST_BASE_DIR/vz008-l2c"
VDEV1="$TEST_BASE_DIR/vz008-vdev1"
VDEV2="$TEST_BASE_DIR/vz008-vdev2"
function cleanup
{
destroy_pool $TESTPOOL
rm -f $VDEV1 $VDEV2 $SPARE $L2C
}
log_assert "'rotational' is readable on ZAP-less vdevs and persists absent"
log_onexit cleanup
log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 $SPARE $L2C
log_must zpool create -f $TESTPOOL \
mirror $VDEV1 $VDEV2 \
cache $L2C \
spare $SPARE
# Leaf vdev should report rotational.
NR=$(zpool get -H -o value rotational $TESTPOOL $VDEV1)
[[ "$NR" == "on" || "$NR" == "off" ]] ||
log_fail "leaf $VDEV1: expected on/off, got '$NR'"
# Virtual (mirror) vdev should report rotational.
MIRROR=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}')
NR=$(zpool get -H -o value rotational $TESTPOOL "$MIRROR")
[[ "$NR" == "on" || "$NR" == "off" ]] ||
log_fail "mirror: expected on/off, got '$NR'"
# Spare vdev should report rotational even though it has no ZAP.
NR=$(zpool get -H -o value rotational $TESTPOOL $SPARE)
[[ "$NR" == "on" || "$NR" == "off" ]] ||
log_fail "spare $SPARE: expected on/off, got '$NR'"
# L2ARC vdev should report rotational even though it has no ZAP.
NR=$(zpool get -H -o value rotational $TESTPOOL $L2C)
[[ "$NR" == "on" || "$NR" == "off" ]] ||
log_fail "L2ARC $L2C: expected on/off, got '$NR'"
# The value must persist across export/import when the spare is absent.
# Remove the spare file before re-import so that vdev_open() cannot read
# the hardware value and the only source is the persisted config.
NR_BEFORE=$(zpool get -H -o value rotational $TESTPOOL $SPARE)
log_must zpool export $TESTPOOL
log_must rm -f $SPARE
log_must zpool import -d $TEST_BASE_DIR $TESTPOOL
NR_AFTER=$(zpool get -H -o value rotational $TESTPOOL $SPARE)
[[ "$NR_BEFORE" == "$NR_AFTER" ]] ||
log_fail "spare rotational changed across import: $NR_BEFORE -> $NR_AFTER"
log_pass "'rotational' readable on spare/L2ARC vdevs and persists when absent"