From 3e9347c9f793917d4c25a6a80ce0dde641303f43 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Sep 2025 09:31:10 -0700 Subject: [PATCH 01/23] ZTS: update upgrade_readonly_pool.ksh Modify the test case to use the `zfs mount` command instead of directly calling the mount command, create a dedicated dataset, and use the default mount point. These changes are intended to preserve the intent of the original test case and resolve some spurious mount failures which have been observed by the CI. Reviewed-by: Igor Kozhukhov Signed-off-by: Brian Behlendorf Closes #17785 --- tests/zfs-tests/tests/functional/upgrade/setup.ksh | 2 +- .../functional/upgrade/upgrade_readonly_pool.ksh | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/zfs-tests/tests/functional/upgrade/setup.ksh b/tests/zfs-tests/tests/functional/upgrade/setup.ksh index 26153aafbc0..0e79e9b8b70 100755 --- a/tests/zfs-tests/tests/functional/upgrade/setup.ksh +++ b/tests/zfs-tests/tests/functional/upgrade/setup.ksh @@ -39,6 +39,6 @@ verify_runnable "global" # create a pool without any features -log_must mkfile 128m $TMPDEV +log_must truncate -s $MINVDEVSIZE $TMPDEV log_pass diff --git a/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh b/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh index d6bd69b7e13..e81d0779468 100755 --- a/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh +++ b/tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh @@ -35,17 +35,19 @@ verify_runnable "global" -TESTFILE="$TESTDIR/file.bin" - log_assert "User accounting upgrade should not be executed on readonly pool" log_onexit cleanup_upgrade # 1. Create a pool with the feature@userobj_accounting disabled to simulate # a legacy pool from a previous ZFS version. -log_must zpool create -d -m $TESTDIR $TESTPOOL $TMPDEV +log_must zpool create -d $TESTPOOL $TMPDEV +log_must zfs create $TESTPOOL/$TESTFS + +MNTPNT=$(get_prop mountpoint $TESTPOOL/$TESTFS) +TESTFILE="$MNTPNT/file.bin" # 2. Create a file on the "legecy" dataset -log_must touch $TESTDIR/file.bin +log_must touch $TESTFILE # 3. Enable feature@userobj_accounting on the pool and verify it is only # "enabled" and not "active": upgrading starts when the filesystem is mounted @@ -54,12 +56,12 @@ log_must test "enabled" == "$(get_pool_prop 'feature@userobj_accounting' $TESTPO # 4. Export the pool and re-import is readonly, without mounting any filesystem log_must zpool export $TESTPOOL -log_must zpool import -o readonly=on -N -d "$(dirname $TMPDEV)" $TESTPOOL +log_must zpool import -o readonly=on -N -d $TEST_BASE_DIR $TESTPOOL # 5. Try to mount the root dataset manually without the "ro" option, then verify # filesystem status and the pool feature status (not "active") to ensure the # pool "readonly" status is enforced. -log_must mount -t zfs -o zfsutil $TESTPOOL $TESTDIR +log_must zfs mount -R $TESTPOOL log_must stat "$TESTFILE" log_mustnot touch "$TESTFILE" log_must test "enabled" == "$(get_pool_prop 'feature@userobj_accounting' $TESTPOOL)" From 0e1a53a8c0edcb524245ac144191b6b52bbeea63 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Sep 2025 09:32:59 -0700 Subject: [PATCH 02/23] Fix 'zpool add' safety check corner cases Three cases were discovered where 'zpool add' would fail to warn when adding vdevs to a pool with a mismatched replication level. These are: 1. When a pool contains mixed file and disk vdevs. 2. When a pool contains an active dRAID distributed spare 3. When a pool contains an active hot spare The lack of warnings are caused by get_replication() assessing the current pool configuration an inconsistent and disabling the mismatched replication check for the new pool configuration after 'zpool add'. This change updates get_replication() to be slightly more tolerant in the non-fatal case. The zpool_add_010_pos.ksh test case was split in to separate tests: zpool_add_warn_create.ksh, pool_add_warn_degraded.ksh, and zpool_add_warn_removal. These test were extended to include coverage for dRAID pools and the three scenarios described above. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #17780 --- cmd/zpool/zpool_vdev.c | 28 ++- tests/runfiles/common.run | 5 +- tests/zfs-tests/tests/Makefile.am | 4 +- .../cli_root/zpool_add/zpool_add.kshlib | 42 ++++ ..._010_pos.ksh => zpool_add_warn_create.ksh} | 101 ++------- .../zpool_add/zpool_add_warn_degraded.ksh | 204 ++++++++++++++++++ .../zpool_add/zpool_add_warn_removal.ksh | 126 +++++++++++ 7 files changed, 418 insertions(+), 92 deletions(-) rename tests/zfs-tests/tests/functional/cli_root/zpool_add/{zpool_add_010_pos.ksh => zpool_add_warn_create.ksh} (60%) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 684b46a2d67..088c0108e91 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -609,22 +609,28 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) ZPOOL_CONFIG_PATH, &path) == 0); /* - * If we have a raidz/mirror that combines disks - * with files, report it as an error. + * Skip active spares they should never cause + * the pool to be evaluated as inconsistent. */ - if (!dontreport && type != NULL && + if (is_spare(NULL, path)) + continue; + + /* + * If we have a raidz/mirror that combines disks + * with files, only report it as an error when + * fatal is set to ensure all the replication + * checks aren't skipped in check_replication(). + */ + if (fatal && !dontreport && type != NULL && strcmp(type, childtype) != 0) { if (ret != NULL) free(ret); ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication " - "level: %s contains both " - "files and devices\n"), - rep.zprl_type); - else - return (NULL); + vdev_error(gettext( + "mismatched replication " + "level: %s contains both " + "files and devices\n"), + rep.zprl_type); dontreport = B_TRUE; } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 2b002830c82..d1be502f160 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -395,8 +395,9 @@ tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', - 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] + 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_warn_create', + 'zpool_add_warn_degraded', 'zpool_add_warn_removal', 'add-o_ashift', + 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 1517f90e99a..2ccc58981bd 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1027,7 +1027,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ - functional/cli_root/zpool_add/zpool_add_010_pos.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_create.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh \ + functional/cli_root/zpool_add/zpool_add_warn_removal.ksh \ functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \ functional/cli_root/zpool_attach/attach-o_ashift.ksh \ functional/cli_root/zpool_attach/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib index 091d65bb4f3..74780bb0214 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib @@ -27,6 +27,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib @@ -89,3 +90,44 @@ function save_dump_dev fi echo $dumpdev } + +function zpool_create_add_setup +{ + typeset -i i=0 + + while ((i < 10)); do + log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/vdev$i + + eval vdev$i=$TEST_BASE_DIR/vdev$i + ((i += 1)) + done + + if is_linux; then + vdev_lo="$(losetup -f "$vdev4" --show)" + elif is_freebsd; then + vdev_lo=/dev/"$(mdconfig -a -t vnode -f "$vdev4")" + else + vdev_lo="$(lofiadm -a "$vdev4")" + fi +} + +function zpool_create_add_cleanup +{ + datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + + if [[ -e $vdev_lo ]]; then + if is_linux; then + log_must losetup -d "$vdev_lo" + elif is_freebsd; then + log_must mdconfig -d -u "$vdev_lo" + else + log_must lofiadm -d "$vdev_lo" + fi + fi + + typeset -i i=0 + while ((i < 10)); do + rm -f $TEST_BASE_DIR/vdev$i + ((i += 1)) + done +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh similarity index 60% rename from tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh rename to tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh index df085a2ec74..661e55998d8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_create.ksh @@ -23,67 +23,51 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# - -# -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib # # DESCRIPTION: -# Verify zpool add succeed when adding vdevs with matching redundancy. +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy for a healthy pool. # # STRATEGY: # 1. Create several files == $MINVDEVSIZE. # 2. Verify 'zpool add' succeeds with matching redundancy. # 3. Verify 'zpool add' warns with differing redundancy. -# 4. Verify 'zpool add' warns with differing redundancy after removal. # verify_runnable "global" -function cleanup -{ - datasetexists $TESTPOOL1 && destroy_pool $TESTPOOL1 +log_assert "Verify 'zpool add' warns for differing redundancy." +log_onexit zpool_create_add_cleanup - typeset -i i=0 - while ((i < 10)); do - rm -f $TEST_BASE_DIR/vdev$i - ((i += 1)) - done -} +zpool_create_add_setup - -log_assert "Verify 'zpool add' succeed with keywords combination." -log_onexit cleanup - -# 1. Create several files == $MINVDEVSIZE. typeset -i i=0 -while ((i < 10)); do - log_must truncate -s $MINVDEVSIZE $TEST_BASE_DIR/vdev$i - - eval vdev$i=$TEST_BASE_DIR/vdev$i - ((i += 1)) -done +typeset -i j=0 set -A redundancy0_create_args \ "$vdev0" set -A redundancy1_create_args \ "mirror $vdev0 $vdev1" \ - "raidz1 $vdev0 $vdev1" + "raidz1 $vdev0 $vdev1" \ + "draid1:1s $vdev0 $vdev1 $vdev9" set -A redundancy2_create_args \ "mirror $vdev0 $vdev1 $vdev2" \ - "raidz2 $vdev0 $vdev1 $vdev2" + "raidz2 $vdev0 $vdev1 $vdev2" \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9" set -A redundancy3_create_args \ "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ - "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" set -A redundancy0_add_args \ "$vdev5" \ @@ -93,21 +77,19 @@ set -A redundancy1_add_args \ "mirror $vdev5 $vdev6" \ "raidz1 $vdev5 $vdev6" \ "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ - "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" + "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \ + "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8" set -A redundancy2_add_args \ "mirror $vdev5 $vdev6 $vdev7" \ - "raidz2 $vdev5 $vdev6 $vdev7" + "raidz2 $vdev5 $vdev6 $vdev7" \ + "draid2 $vdev5 $vdev6 $vdev7" set -A redundancy3_add_args \ "mirror $vdev5 $vdev6 $vdev7 $vdev8" \ - "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" - -set -A log_args "log" "$vdev4" -set -A cache_args "cache" "$vdev4" -set -A spare_args "spare" "$vdev4" - -typeset -i j=0 + "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \ + "draid3 $vdev5 $vdev6 $vdev7 $vdev8" function zpool_create_add { @@ -148,30 +130,6 @@ function zpool_create_forced_add done } -function zpool_create_rm_add -{ - typeset -n create_args=$1 - typeset -n add_args=$2 - typeset -n rm_args=$3 - - i=0 - while ((i < ${#create_args[@]})); do - j=0 - while ((j < ${#add_args[@]})); do - log_must zpool create $TESTPOOL1 ${create_args[$i]} - log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} - log_must zpool add $TESTPOOL1 ${add_args[$j]} - log_must zpool remove $TESTPOOL1 ${rm_args[1]} - log_mustnot zpool add $TESTPOOL1 ${rm_args[1]} - log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} - log_must zpool destroy -f $TESTPOOL1 - - ((j += 1)) - done - ((i += 1)) - done -} - # 2. Verify 'zpool add' succeeds with matching redundancy. zpool_create_add redundancy0_create_args redundancy0_add_args zpool_create_add redundancy1_create_args redundancy1_add_args @@ -195,17 +153,4 @@ zpool_create_forced_add redundancy3_create_args redundancy0_add_args zpool_create_forced_add redundancy3_create_args redundancy1_add_args zpool_create_forced_add redundancy3_create_args redundancy2_add_args -# 4. Verify 'zpool add' warns with differing redundancy after removal. -zpool_create_rm_add redundancy1_create_args redundancy1_add_args log_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args log_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args log_args - -zpool_create_rm_add redundancy1_create_args redundancy1_add_args cache_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args cache_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args cache_args - -zpool_create_rm_add redundancy1_create_args redundancy1_add_args spare_args -zpool_create_rm_add redundancy2_create_args redundancy2_add_args spare_args -zpool_create_rm_add redundancy3_create_args redundancy3_add_args spare_args - -log_pass "'zpool add' succeed with keywords combination." +log_pass "Verify 'zpool add' warns for differing redundancy." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh new file mode 100755 index 00000000000..313eb3666f2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh @@ -0,0 +1,204 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib + +# +# DESCRIPTION: +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy for a degraded pool. +# +# STRATEGY: +# 1. Create several files == $MINVDEVSIZE. +# 2. Verify 'zpool add' succeeds with matching redundancy +# 3. Verify 'zpool add' warns with differing redundancy when +# a. Degraded pool with replaced mismatch vdev (file vs disk) +# b. Degraded pool dRAID distributed spare active +# c. Degraded pool hot spare active +# + +verify_runnable "global" + +log_assert "Verify 'zpool add' warns for differing redundancy." +log_onexit zpool_create_add_cleanup + +zpool_create_add_setup + +set -A redundancy1_create_args \ + "mirror $vdev0 $vdev1" \ + "raidz1 $vdev0 $vdev1" \ + "draid1:1s $vdev0 $vdev1 $vdev9" + +set -A redundancy2_create_args \ + "mirror $vdev0 $vdev1 $vdev2" \ + "raidz2 $vdev0 $vdev1 $vdev2" \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9" + +set -A redundancy3_create_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_add_args \ + "mirror $vdev5 $vdev6" \ + "raidz1 $vdev5 $vdev6" \ + "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \ + "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8" + +set -A redundancy2_add_args \ + "mirror $vdev5 $vdev6 $vdev7" \ + "raidz2 $vdev5 $vdev6 $vdev7" \ + "draid2 $vdev5 $vdev6 $vdev7" + +set -A redundancy3_add_args \ + "mirror $vdev5 $vdev6 $vdev7 $vdev8" \ + "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \ + "draid3 $vdev5 $vdev6 $vdev7 $vdev8" + +set -A redundancy1_create_draid_args \ + "draid1:1s $vdev0 $vdev1 $vdev2" + +set -A redundancy2_create_draid_args \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev3" + +set -A redundancy3_create_draid_args \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_create_spare_args \ + "mirror $vdev0 $vdev1 spare $vdev_lo" \ + "raidz1 $vdev0 $vdev1 spare $vdev_lo" \ + "draid1 $vdev0 $vdev1 spare $vdev_lo" + +set -A redundancy2_create_spare_args \ + "mirror $vdev0 $vdev1 $vdev2 spare $vdev_lo" \ + "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev_lo" \ + "draid2 $vdev0 $vdev1 $vdev2 spare $vdev_lo" + +set -A redundancy3_create_spare_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" \ + "draid3 $vdev0 $vdev1 $vdev2 $vdev3 spare $vdev_lo" + +set -A replace_args "$vdev1" "$vdev_lo" +set -A draid1_args "$vdev1" "draid1-0-0" +set -A draid2_args "$vdev1" "draid2-0-0" +set -A draid3_args "$vdev1" "draid3-0-0" + +typeset -i i=0 +typeset -i j=0 + +function zpool_create_degraded_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool offline -f $TESTPOOL1 ${rm_args[0]} + log_must zpool replace -w $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool destroy -f $TESTPOOL1 + log_must zpool labelclear -f ${rm_args[0]} + + ((j += 1)) + done + ((i += 1)) + done +} + +function zpool_create_forced_degraded_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool offline -f $TESTPOOL1 ${rm_args[0]} + log_must zpool replace -w $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_mustnot zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]} + log_must zpool destroy -f $TESTPOOL1 + log_must zpool labelclear -f ${rm_args[0]} + + ((j += 1)) + done + ((i += 1)) + done +} + +# 2. Verify 'zpool add' succeeds with matching redundancy and a degraded pool. +zpool_create_degraded_add redundancy1_create_args redundancy1_add_args replace_args +zpool_create_degraded_add redundancy2_create_args redundancy2_add_args replace_args +zpool_create_degraded_add redundancy3_create_args redundancy3_add_args replace_args + +# 3. Verify 'zpool add' warns with differing redundancy and a degraded pool. +# +# a. Degraded pool with replaced mismatch vdev (file vs disk) +zpool_create_forced_degraded_add redundancy1_create_args redundancy2_add_args replace_args +zpool_create_forced_degraded_add redundancy1_create_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy2_create_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy2_create_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy3_create_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy3_create_args redundancy2_add_args replace_args + +# b. Degraded pool dRAID distributed spare active + +zpool_create_forced_degraded_add redundancy1_create_draid_args redundancy2_add_args draid1_args +zpool_create_forced_degraded_add redundancy1_create_draid_args redundancy3_add_args draid1_args + +zpool_create_forced_degraded_add redundancy2_create_draid_args redundancy1_add_args draid2_args +zpool_create_forced_degraded_add redundancy2_create_draid_args redundancy3_add_args draid2_args + +zpool_create_forced_degraded_add redundancy3_create_draid_args redundancy1_add_args draid3_args +zpool_create_forced_degraded_add redundancy3_create_draid_args redundancy2_add_args draid3_args + +# c. Degraded pool hot spare active +zpool_create_forced_degraded_add redundancy1_create_spare_args redundancy2_add_args replace_args +zpool_create_forced_degraded_add redundancy1_create_spare_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy2_create_spare_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy2_create_spare_args redundancy3_add_args replace_args + +zpool_create_forced_degraded_add redundancy3_create_spare_args redundancy1_add_args replace_args +zpool_create_forced_degraded_add redundancy3_create_spare_args redundancy2_add_args replace_args + +log_pass "Verify 'zpool add' warns for differing redundancy." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh new file mode 100755 index 00000000000..782858e301a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_warn_removal.ksh @@ -0,0 +1,126 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2012, 2016 by Delphix. All rights reserved. +# Copyright 2025 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib + +# +# DESCRIPTION: +# Verify zpool add succeeds when adding vdevs with matching redundancy +# and warns with differing redundancy after removal. +# +# STRATEGY: +# 1. Create several files == $MINVDEVSIZE. +# 2. Verify 'zpool add' warns with differing redundancy after removal. +# + +verify_runnable "global" + +log_assert "Verify 'zpool add' warns for differing redundancy." +log_onexit zpool_create_add_cleanup + +zpool_create_add_setup + +typeset -i i=0 +typeset -i j=0 + +set -A redundancy1_create_args \ + "mirror $vdev0 $vdev1" \ + "raidz1 $vdev0 $vdev1" \ + "draid1:1s $vdev0 $vdev1 $vdev9" + +set -A redundancy2_create_args \ + "mirror $vdev0 $vdev1 $vdev2" \ + "raidz2 $vdev0 $vdev1 $vdev2" \ + "draid2:1s $vdev0 $vdev1 $vdev2 $vdev9" + +set -A redundancy3_create_args \ + "mirror $vdev0 $vdev1 $vdev2 $vdev3" \ + "raidz3 $vdev0 $vdev1 $vdev2 $vdev3" \ + "draid3:1s $vdev0 $vdev1 $vdev2 $vdev3 $vdev9" + +set -A redundancy1_add_args \ + "mirror $vdev5 $vdev6" \ + "raidz1 $vdev5 $vdev6" \ + "raidz1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 raidz1 $vdev7 $vdev8" \ + "draid1 $vdev5 $vdev6 mirror $vdev7 $vdev8" \ + "mirror $vdev5 $vdev6 draid1 $vdev7 $vdev8" + +set -A redundancy2_add_args \ + "mirror $vdev5 $vdev6 $vdev7" \ + "raidz2 $vdev5 $vdev6 $vdev7" \ + "draid2 $vdev5 $vdev6 $vdev7" + +set -A redundancy3_add_args \ + "mirror $vdev5 $vdev6 $vdev7 $vdev8" \ + "raidz3 $vdev5 $vdev6 $vdev7 $vdev8" \ + "draid3 $vdev5 $vdev6 $vdev7 $vdev8" + +set -A log_args "log" "$vdev_lo" +set -A cache_args "cache" "$vdev_lo" +set -A spare_args "spare" "$vdev_lo" + + +function zpool_create_rm_add +{ + typeset -n create_args=$1 + typeset -n add_args=$2 + typeset -n rm_args=$3 + + i=0 + while ((i < ${#create_args[@]})); do + j=0 + while ((j < ${#add_args[@]})); do + log_must zpool create $TESTPOOL1 ${create_args[$i]} + log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${add_args[$j]} + log_must zpool remove $TESTPOOL1 ${rm_args[1]} + log_mustnot zpool add $TESTPOOL1 ${rm_args[1]} + log_must zpool add $TESTPOOL1 ${rm_args[0]} ${rm_args[1]} + log_must zpool destroy -f $TESTPOOL1 + + ((j += 1)) + done + ((i += 1)) + done +} + +# 2. Verify 'zpool add' warns with differing redundancy after removal. +zpool_create_rm_add redundancy1_create_args redundancy1_add_args log_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args log_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args log_args + +zpool_create_rm_add redundancy1_create_args redundancy1_add_args cache_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args cache_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args cache_args + +zpool_create_rm_add redundancy1_create_args redundancy1_add_args spare_args +zpool_create_rm_add redundancy2_create_args redundancy2_add_args spare_args +zpool_create_rm_add redundancy3_create_args redundancy3_add_args spare_args From c722bf88125a9378cfc7601b1a8a8ec4933bb430 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Sep 2025 09:35:35 -0700 Subject: [PATCH 03/23] Add interface to interface spa_get_worst_case_min_alloc() function Provide an interface to retrieve the lowest and highest minimum allocation size for the normal allocation class. This can be used by external consumers of the DMU to estimate potential wasted capacity when setting the recordsize for an object. The new "min_alloc" and "max_alloc" keys are added to the pool configuration and used by default_volblocksize() to warn when an ineffecient block size is requested. For older kmods which don't yet include the new keys fallback to the previous logic. Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #17758 --- cmd/zfs/zfs_main.c | 28 +++++++++++++++++++++------- include/sys/fs/zfs.h | 2 ++ include/sys/spa.h | 1 + include/sys/spa_impl.h | 1 + module/zfs/spa_config.c | 2 ++ module/zfs/spa_misc.c | 15 +++++++++++++++ module/zfs/vdev.c | 15 ++++++++------- module/zfs/vdev_label.c | 2 ++ 8 files changed, 52 insertions(+), 14 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 484986bde71..b66440c8f9f 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -930,19 +930,15 @@ zfs_do_clone(int argc, char **argv) } /* - * Return a default volblocksize for the pool which always uses more than - * half of the data sectors. This primarily applies to dRAID which always - * writes full stripe widths. + * Calculate the minimum allocation size based on the top-level vdevs. */ static uint64_t -default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +calculate_volblocksize(nvlist_t *config) { - uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + uint64_t asize = SPA_MINBLOCKSIZE; nvlist_t *tree, **vdevs; uint_t nvdevs; - nvlist_t *config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { @@ -973,6 +969,24 @@ default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) } } + return (asize); +} + +/* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0) + asize = calculate_volblocksize(config); + /* * Calculate the target volblocksize such that more than half * of the asize is used. The following table is for 4k sectors. diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 49ab9d3db79..662fd81c5ee 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -748,6 +748,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_MIN_ALLOC "min_alloc" +#define ZPOOL_CONFIG_MAX_ALLOC "max_alloc" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 66db16b33c5..39971827073 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1084,6 +1084,7 @@ extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 07a959db344..62b062984d3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -265,6 +265,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_max_alloc; /* of vdevs in normal class */ uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index cf28955b0c5..f615591e826 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -372,6 +372,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 6f7c060f97f..a539468df6a 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -806,6 +806,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_max_alloc = 0; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ @@ -1864,6 +1865,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } +/* + * Return the range of minimum allocation sizes for the normal allocation + * class. This can be used by external consumers of the DMU to estimate + * potential wasted capacity when setting the recordsize for an object. + * This is mainly for dRAID pools which always pad to a full stripe width. + */ +void +spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc) +{ + *min_alloc = spa->spa_min_alloc; + *max_alloc = spa->spa_max_alloc; +} + /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be @@ -3085,6 +3099,7 @@ EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index fc6d445f978..654e034de9e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1497,12 +1497,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; - if (spa->spa_gcd_alloc == INT_MAX) { + + if (min_alloc > spa->spa_max_alloc) + spa->spa_max_alloc = min_alloc; + + if (spa->spa_gcd_alloc == INT_MAX) spa->spa_gcd_alloc = min_alloc; - } else { - spa->spa_gcd_alloc = vdev_gcd(min_alloc, - spa->spa_gcd_alloc); - } + else + spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } void @@ -1560,8 +1562,7 @@ vdev_metaslab_group_create(vdev_t *vd) if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; - uint64_t min_alloc = vdev_get_min_alloc(vd); - vdev_spa_set_alloc(spa, min_alloc); + vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); } } } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index c44f654b026..0d4fdaa77ba 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -511,6 +511,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC, + vdev_get_min_alloc(vd)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, From 26b0f561be8c3b8cf32dcede255be83982f3273f Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Thu, 25 Sep 2025 14:06:28 -0400 Subject: [PATCH 04/23] dnode_next_offset: backtrack if lower level does not match This changes the basic search algorithm from a single search up and down the tree to a full depth-first traversal to handle conditions where the tree matches at a higher level but not a lower level. Normally higher level blocks always point to matching blocks, but there are cases where this does not happen: 1. Racing block pointer updates from dbuf_write_ready. Before f664f1ee7f (#8946), both dbuf_write_ready and dnode_next_offset held dn_struct_rwlock which protected against pointer writes from concurrent syncs. This no longer applies, so sync context can f.e. clear or fill all L1->L0 BPs before the L2->L1 BP and higher BP's are updated. dnode_free_range in particular can reach this case and skip over L1 blocks that need to be dirtied. Later, sync will panic in free_children when trying to clear a non-dirty indirect block. This case was found with ztest. 2. txg > 0, non-hole case. This is #11196. Freeing blocks/dnodes breaks the assumption that a match at a higher level implies a match at a lower level when filtering txg > 0. Whenever some but not all L0 blocks are freed, the parent L1 block is rewritten. Its updated L2->L1 BP reflects a newer birth txg. Later when searching by txg, if the L1 block matches since the txg is newer, it is possible that none of the remaining L1->L0 BPs match if none have been updated. The same behavior is possible with dnode search at L0. This is reachable from dsl_destroy_head for synchronous freeing. When this happens open context fails to free objects leaving sync context stuck freeing potentially many objects. This is also reachable from traverse_pool for extreme rewind where it is theoretically possible that datasets not dirtied after txg are skipped if the MOS has high enough indirection to trigger this case. In both of these cases, without backtracking the search ends prematurely as ESRCH result implies no more matches in the entire object. Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Signed-off-by: Robert Evans Closes #16025 Closes #11196 --- module/zfs/dnode.c | 65 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 6c150d31c66..e88d394b522 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2655,6 +2655,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, return (error); } +/* + * Adjust *offset to the next (or previous) block byte offset at lvl. + * Returns FALSE if *offset would overflow or underflow. + */ +static boolean_t +dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl) +{ + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int span = lvl * epbs + dn->dn_datablkshift; + uint64_t blkid, maxblkid; + + if (span >= 8 * sizeof (uint64_t)) + return (B_FALSE); + + blkid = *offset >> span; + maxblkid = 1ULL << (8 * sizeof (*offset) - span); + if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid) + *offset = (blkid + 1) << span; + else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0) + *offset = (blkid << span) - 1; + else + return (B_FALSE); + + return (B_TRUE); +} + /* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find @@ -2682,7 +2708,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { - uint64_t initial_offset = *offset; + uint64_t matched = *offset; int lvl, maxlvl; int error = 0; @@ -2706,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, maxlvl = dn->dn_phys->dn_nlevels; - for (lvl = minlvl; lvl <= maxlvl; lvl++) { + for (lvl = minlvl; lvl <= maxlvl; ) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); - if (error != ESRCH) + if (error == 0 && lvl > minlvl) { + --lvl; + matched = *offset; + } else if (error == ESRCH && lvl < maxlvl && + dnode_next_block(dn, flags, &matched, lvl)) { + /* + * Continue search at next/prev offset in lvl+1 block. + * + * Usually we only search upwards at the start of the + * search as higher level blocks point at a matching + * minlvl block in most cases, but we backtrack if not. + * + * This can happen for txg > 0 searches if the block + * contains only BPs/dnodes freed at that txg. It also + * happens if we are still syncing out the tree, and + * some BP's at higher levels are not updated yet. + * + * We must adjust offset to avoid coming back to the + * same offset and getting stuck looping forever. This + * also deals with the case where offset is already at + * the beginning or end of the object. + */ + ++lvl; + *offset = matched; + } else { break; - } - - while (error == 0 && --lvl >= minlvl) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); + } } /* @@ -2727,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, error = 0; } - if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? - initial_offset < *offset : initial_offset > *offset)) - error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); From 5c38029f4bd2bd2753e908d0870e0783088c2310 Mon Sep 17 00:00:00 2001 From: patrickxia Date: Thu, 25 Sep 2025 15:05:42 -0400 Subject: [PATCH 05/23] zdb: add ZFS_KEYFORMAT_RAW support for -K option This change adds support for ZFS_KEYFORMAT_RAW to zdb_derive_key in zdb.c. The implementation reads the raw key from the file specified by the -K option which is consistent with how raw keys are handled in the other parts of ZFS, along with a check to ensure that the keyfile doesn't have too many bytes. Reviewed-by: Brian Behlendorf Signed-off-by: Patrick Xia Closes #17783 --- cmd/zdb/zdb.c | 20 +++++ tests/runfiles/common.run | 8 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zdb/zdb_encrypted_raw.ksh | 75 +++++++++++++++++++ 4 files changed, 100 insertions(+), 4 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index d655fa715e1..70a4ed46f26 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3301,6 +3301,7 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out) uint64_t keyformat, salt, iters; int i; unsigned char c; + FILE *f; VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t), @@ -3333,6 +3334,25 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out) break; + case ZFS_KEYFORMAT_RAW: + if ((f = fopen(key_material, "r")) == NULL) + return (B_FALSE); + + if (fread(key_out, 1, WRAPPING_KEY_LEN, f) != + WRAPPING_KEY_LEN) { + (void) fclose(f); + return (B_FALSE); + } + + /* Check the key length */ + if (fgetc(f) != EOF) { + (void) fclose(f); + return (B_FALSE); + } + + (void) fclose(f); + break; + default: fatal("no support for key format %u\n", (unsigned int) keyformat); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index d1be502f160..36a56ddee58 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -168,10 +168,10 @@ tags = ['functional', 'cli_root', 'zinject'] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', - 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', - 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', - 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup', - 'zdb_tunables'] + 'zdb_display_block', 'zdb_encrypted', 'zdb_encrypted_raw', + 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', + 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', + 'zdb_backup', 'zdb_tunables'] pre = post = tags = ['functional', 'cli_root', 'zdb'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 2ccc58981bd..ec5da0defa4 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -640,6 +640,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zdb/zdb_decompress_zstd.ksh \ functional/cli_root/zdb/zdb_display_block.ksh \ functional/cli_root/zdb/zdb_encrypted.ksh \ + functional/cli_root/zdb/zdb_encrypted_raw.ksh \ functional/cli_root/zdb/zdb_label_checksum.ksh \ functional/cli_root/zdb/zdb_object_range_neg.ksh \ functional/cli_root/zdb/zdb_object_range_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh new file mode 100755 index 00000000000..85d267d5402 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted_raw.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib + +# +# DESCRIPTION: +# 'zdb -K ...' should enable reading from a raw-encrypted dataset +# +# STRATEGY: +# 1. Create an encrypted dataset +# 2. Write some data to a file +# 3. Run zdb -dddd on the file, confirm it can't be read +# 4. Run zdb -K ... -ddddd on the file, confirm it can be read +# + +verify_runnable "both" + +dataset="$TESTPOOL/$TESTFS2" +file="$TESTDIR2/somefile" +keyfile="$TEST_BASE_DIR/keyfile" + +function cleanup +{ + datasetexists "$dataset" && destroy_dataset "$dataset" -f + rm -f "$keyfile" + default_cleanup_noexit +} + +log_onexit cleanup + +log_must default_setup_noexit $DISKS + +log_assert "'zdb -K' should enable reading from a raw-encrypted dataset" + +# The key must be 32 bytes long. +echo -n "$RAWKEY" > "$keyfile" + +log_must zfs create -o mountpoint="$TESTDIR2" \ + -o encryption=on -o keyformat=raw -o keylocation="file://$keyfile" \ + "$dataset" + +echo 'my great encrypted text' > "$file" + +typeset -i obj=$(ls -i "$file" | cut -d' ' -f1) +typeset -i size=$(wc -c < "$file") + +log_note "test file $file is objid $obj, size $size" + +sync_pool "$TESTPOOL" true + +log_must eval "zdb -dddd $dataset $obj | grep -q 'object encrypted'" + +log_must eval "zdb -K $keyfile -dddd $dataset $obj | grep -q 'size\s$size$'" + +log_pass "'zdb -K' enables reading from a raw-encrypted dataset" From aecd6deeb3ba2522c2dfac8e22b1787c73a54e74 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Sep 2025 17:47:32 -0700 Subject: [PATCH 06/23] CI: update perf and bpftools with the kernel packages When updating a Fedora instance to an experimental kernel make sure to include the matching versioned perf and bpftool packages. This helps ensure there are no unexpected conflicts which would prevent the new packages from being installed. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #17791 --- .github/workflows/scripts/qemu-3-deps-vm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh index ee058b48808..4a7e724586f 100755 --- a/.github/workflows/scripts/qemu-3-deps-vm.sh +++ b/.github/workflows/scripts/qemu-3-deps-vm.sh @@ -104,7 +104,7 @@ function install_fedora_experimental_kernel { our_version="$1" sudo dnf -y copr enable @kernel-vanilla/stable sudo dnf -y copr enable @kernel-vanilla/mainline - all="$(sudo dnf list --showduplicates kernel-*)" + all="$(sudo dnf list --showduplicates kernel-* python3-perf* perf* bpftool*)" echo "Available versions:" echo "$all" From 79be201806fd772b42130d95a5dc85961f83e1df Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 26 Sep 2025 10:00:18 -0700 Subject: [PATCH 07/23] Linux 6.17 compat: META Update the META file to reflect compatibility with the 6.17 kernel. Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Brian Behlendorf Closes #17789 --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 5704b5c6de8..bdb7aee4804 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.16 +Linux-Maximum: 6.17 Linux-Minimum: 4.18 From a44985315ef534724a4c2c56e13177cfe68e3a33 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 26 Sep 2025 15:32:41 -0700 Subject: [PATCH 08/23] CI: Remove Buildbot references The Buildbot CI infrastructure has been fully replaced by GitHub Actions. Remove any lingering references from the repository. Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #17794 --- .github/PULL_REQUEST_TEMPLATE.md | 5 ----- contrib/pyzfs/libzfs_core/test/test_libzfs_core.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 79809179cf1..47edc817460 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,11 +2,6 @@ - - ### Motivation and Context diff --git a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py index 971aa1d0d49..bad1af2d167 100644 --- a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py +++ b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py @@ -4223,7 +4223,7 @@ def reset(self): self.getRoot().reset() return - # On the Buildbot builders this may fail with "pool is busy" + # On the CI builders this may fail with "pool is busy" # Retry 5 times before raising an error retry = 0 while True: From 4ff25e9013790c20e21b42b2d750a52c6e58d54e Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 26 Sep 2025 17:52:57 -0700 Subject: [PATCH 09/23] CI: Switch FreeBSD 15 to 15.0-ALPHA3 Signed-off-by: Brian Behlendorf Reviewed-by: Alexander Motin Closes #17795 --- .github/workflows/scripts/qemu-2-start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 8439942c5a4..1c608348ffc 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -121,7 +121,7 @@ case "$OS" in KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; freebsd15-0c) - FreeBSD="15.0-ALPHA2" + FreeBSD="15.0-ALPHA3" OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" From 8d4c3ee9e6f4919db7d4bd9eba7322356fbe1060 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 29 Sep 2025 16:29:20 -0700 Subject: [PATCH 10/23] zvol: Fix blk-mq sync The zvol blk-mq codepaths would erroneously send FLUSH and TRIM commands down the read codepath, rather than write. This fixes the issue, and updates the zvol_misc_fua test to verify that sync writes are actually happening. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Tony Hutter Closes #17761 Closes #17765 --- include/os/linux/kernel/linux/blkdev_compat.h | 18 --------- module/os/linux/zfs/zvol_os.c | 23 ++++++++++- .../zvol/zvol_misc/zvol_misc_fua.ksh | 40 ++++++++++++++++++- 3 files changed, 61 insertions(+), 20 deletions(-) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index 076dab8ba6d..214f3ea0e78 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -542,24 +542,6 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id) } #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ -/* - * All the io_*() helper functions below can operate on a bio, or a rq, but - * not both. The older submit_bio() codepath will pass a bio, and the - * newer blk-mq codepath will pass a rq. - */ -static inline int -io_data_dir(struct bio *bio, struct request *rq) -{ - if (rq != NULL) { - if (op_is_write(req_op(rq))) { - return (WRITE); - } else { - return (READ); - } - } - return (bio_data_dir(bio)); -} - static inline int io_is_flush(struct bio *bio, struct request *rq) { diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index bac166fcd89..967a018640e 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -484,7 +484,28 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); - int rw = io_data_dir(bio, rq); + int rw; + + if (rq != NULL) { + /* + * Flush & trim requests go down the zvol_write codepath. Or + * more specifically: + * + * If request is a write, or if it's op_is_sync() and not a + * read, or if it's a flush, or if it's a discard, then send the + * request down the write path. + */ + if (op_is_write(rq->cmd_flags) || + (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) || + req_op(rq) == REQ_OP_FLUSH || + op_is_discard(rq->cmd_flags)) { + rw = WRITE; + } else { + rw = READ; + } + } else { + rw = bio_data_dir(bio); + } if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, SET_ERROR(ENXIO)); diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh index 571a698eb63..502ebada22d 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh @@ -50,17 +50,53 @@ fi typeset datafile1="$(mktemp -t zvol_misc_fua1.XXXXXX)" typeset datafile2="$(mktemp -t zvol_misc_fua2.XXXXXX)" +typeset datafile3="$(mktemp -t zvol_misc_fua3_log.XXXXXX)" typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL +typeset DISK1=${DISKS%% *} function cleanup { - rm "$datafile1" "$datafile2" + log_must zpool remove $TESTPOOL $datafile3 + rm "$datafile1" "$datafile2" "$datafile2" +} + +# Prints the total number of sync writes for a vdev +# $1: vdev +function get_sync +{ + zpool iostat -p -H -v -r $TESTPOOL $1 | \ + awk '/[0-9]+$/{s+=$4+$5} END{print s}' } function do_test { # Wait for udev to create symlinks to our zvol block_device_wait $zvolpath + # Write using sync (creates FLUSH calls after writes, but not FUA) + old_vdev_writes=$(get_sync $DISK1) + old_log_writes=$(get_sync $datafile3) + + log_must fio --name=write_iops --size=5M \ + --ioengine=libaio --verify=0 --bs=4K \ + --iodepth=1 --rw=randwrite --group_reporting=1 \ + --filename=$zvolpath --sync=1 + + vdev_writes=$(( $(get_sync $DISK1) - $old_vdev_writes)) + log_writes=$(( $(get_sync $datafile3) - $old_log_writes)) + + # When we're doing sync writes, we should see many more writes go to + # the log vs the first vdev. Experiments show anywhere from a 160-320x + # ratio of writes to the log vs the first vdev (due to some straggler + # writes to the first vdev). + # + # Check that we have a large ratio (100x) of sync writes going to the + # log device + ratio=$(($log_writes / $vdev_writes)) + log_note "Got $log_writes log writes, $vdev_writes vdev writes." + if [ $ratio -lt 100 ] ; then + log_fail "Expected > 100x more log writes than vdev writes. " + fi + # Create a data file log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5 @@ -81,6 +117,8 @@ log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)" log_onexit cleanup log_must zfs set compression=off $TESTPOOL/$TESTVOL +log_must truncate -s 100M $datafile3 +log_must zpool add $TESTPOOL log $datafile3 log_note "Testing without blk-mq" From 75be5f2973b308d24fe1583a1221b8bae9b6342d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 29 Sep 2025 16:32:05 -0700 Subject: [PATCH 11/23] CI: Add ZTS -O option, log Setup Testing Machines step Add a -O option to zfs-test.sh to dump debug information on test timeout. The debug info includes: - 30 lines from 'top' - /proc//stack output of process with highest CPU usage - Last lines strace-ing process with highest CPU usage - /proc/sysrq-trigger kernel stack traces All debug information gets dumped to /dev/kmsg (Linux only). In addition, print out the VM console lines from the "Setup Testing Machines" step. We have often see VMs timeout at this step and don't know why. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #17753 --- .github/workflows/scripts/qemu-3-deps-vm.sh | 9 +-- .github/workflows/scripts/qemu-5-setup.sh | 25 ++++++--- .github/workflows/scripts/qemu-6-tests.sh | 2 +- scripts/zfs-tests.sh | 9 ++- tests/test-runner/bin/test-runner.py.in | 61 +++++++++++++++++++++ 5 files changed, 93 insertions(+), 13 deletions(-) diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh index 4a7e724586f..f67bb2f68e9 100755 --- a/.github/workflows/scripts/qemu-3-deps-vm.sh +++ b/.github/workflows/scripts/qemu-3-deps-vm.sh @@ -20,7 +20,7 @@ function archlinux() { sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \ fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \ parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \ - samba sysstat rng-tools rsync wget xxhash + samba strace sysstat rng-tools rsync wget xxhash echo "##[endgroup]" } @@ -43,7 +43,8 @@ function debian() { lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \ python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \ python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \ - rsync samba sysstat uuid-dev watchdog wget xfslibs-dev xxhash zlib1g-dev + rsync samba strace sysstat uuid-dev watchdog wget xfslibs-dev xxhash \ + zlib1g-dev echo "##[endgroup]" } @@ -87,8 +88,8 @@ function rhel() { libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \ parted perf python3 python3-cffi python3-devel python3-packaging \ kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \ - rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \ - zlib-devel + rpm-build rsync samba strace sysstat systemd watchdog wget xfsprogs-devel \ + xxhash zlib-devel echo "##[endgroup]" } diff --git a/.github/workflows/scripts/qemu-5-setup.sh b/.github/workflows/scripts/qemu-5-setup.sh index 0adcad2a99b..4869c1003e4 100755 --- a/.github/workflows/scripts/qemu-5-setup.sh +++ b/.github/workflows/scripts/qemu-5-setup.sh @@ -108,19 +108,30 @@ echo '*/5 * * * * /root/cronjob.sh' > crontab.txt sudo crontab crontab.txt rm crontab.txt -# check if the machines are okay -echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)" -for ((i=1; i<=VMs; i++)); do - .github/workflows/scripts/qemu-wait-for-vm.sh vm$i -done -echo "All $VMs VMs are up now." - # Save the VM's serial output (ttyS0) to /var/tmp/console.txt # - ttyS0 on the VM corresponds to a local /dev/pty/N entry # - use 'virsh ttyconsole' to lookup the /dev/pty/N entry for ((i=1; i<=VMs; i++)); do mkdir -p $RESPATH/vm$i read "pty" <<< $(sudo virsh ttyconsole vm$i) + + # Create the file so we can tail it, even if there's no output. + touch $RESPATH/vm$i/console.txt + sudo nohup bash -c "cat $pty > $RESPATH/vm$i/console.txt" & + + # Write all VM boot lines to the console to aid in debugging failed boots. + # The boot lines from all the VMs will be munged together, so prepend each + # line with the vm hostname (like 'vm1:'). + (while IFS=$'\n' read -r line; do echo "vm$i: $line" ; done < <(sudo tail -f $RESPATH/vm$i/console.txt)) & + done echo "Console logging for ${VMs}x $OS started." + + +# check if the machines are okay +echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)" +for ((i=1; i<=VMs; i++)); do + .github/workflows/scripts/qemu-wait-for-vm.sh vm$i +done +echo "All $VMs VMs are up now." diff --git a/.github/workflows/scripts/qemu-6-tests.sh b/.github/workflows/scripts/qemu-6-tests.sh index 5ab822f4f07..ca6ac77f146 100755 --- a/.github/workflows/scripts/qemu-6-tests.sh +++ b/.github/workflows/scripts/qemu-6-tests.sh @@ -111,7 +111,7 @@ fi sudo dmesg -c > dmesg-prerun.txt mount > mount.txt df -h > df-prerun.txt -$TDIR/zfs-tests.sh -vK -s 3GB -T $TAGS +$TDIR/zfs-tests.sh -vKO -s 3GB -T $TAGS RV=$? df -h > df-postrun.txt echo $RV > tests-exitcode.txt diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index 04f3b6f32cb..5a0a1a60944 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -38,6 +38,7 @@ DEBUG="" CLEANUP="yes" CLEANUPALL="no" KMSG="" +TIMEOUT_DEBUG="" LOOPBACK="yes" STACK_TRACER="no" FILESIZE="4G" @@ -364,6 +365,7 @@ OPTIONS: -k Disable cleanup after test failure -K Log test names to /dev/kmsg -f Use files only, disables block device tests + -O Dump debugging info to /dev/kmsg on test timeout -S Enable stack tracer (negative performance impact) -c Only create and populate constrained path -R Automatically rerun failing tests @@ -402,7 +404,7 @@ $0 -x EOF } -while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do +while getopts 'hvqxkKfScRmOn:d:Ds:r:?t:T:u:I:' OPTION; do case $OPTION in h) usage @@ -445,6 +447,9 @@ while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do export NFS=1 . "$nfsfile" ;; + O) + TIMEOUT_DEBUG="yes" + ;; d) FILEDIR="$OPTARG" ;; @@ -773,6 +778,7 @@ msg "${TEST_RUNNER}" \ "${DEBUG:+-D}" \ "${KMEMLEAK:+-m}" \ "${KMSG:+-K}" \ + "${TIMEOUT_DEBUG:+-O}" \ "-c \"${RUNFILES}\"" \ "-T \"${TAGS}\"" \ "-i \"${STF_SUITE}\"" \ @@ -783,6 +789,7 @@ msg "${TEST_RUNNER}" \ ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ ${KMSG:+-K} \ + ${TIMEOUT_DEBUG:+-O} \ -c "${RUNFILES}" \ -T "${TAGS}" \ -i "${STF_SUITE}" \ diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in index 2158208be6e..d2c1185e4a9 100755 --- a/tests/test-runner/bin/test-runner.py.in +++ b/tests/test-runner/bin/test-runner.py.in @@ -34,6 +34,7 @@ from select import select from subprocess import PIPE from subprocess import Popen from subprocess import check_output +from subprocess import run from threading import Timer from time import time, CLOCK_MONOTONIC from os.path import exists @@ -187,6 +188,63 @@ User: %s ''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user) def kill_cmd(self, proc, options, kmemleak, keyboard_interrupt=False): + + """ + We're about to kill a command due to a timeout. + If we're running with the -O option, then dump debug info about the + process with the highest CPU usage to /dev/kmsg (Linux only). This can + help debug the timeout. + + Debug info includes: + - 30 lines from 'top' + - /proc//stack output of process with highest CPU usage + - Last lines strace-ing process with highest CPU usage + """ + if exists("/dev/kmsg"): + c = """ +TOP_OUT="$(COLUMNS=160 top -b -n 1 | head -n 30)" +read -r PID CMD <<< $(echo "$TOP_OUT" | /usr/bin/awk \ +"/COMMAND/{ + print_next=1 + next +} +{ + if (print_next == 1) { + print \\$1\\" \\"\\$12 + exit + } +}") +echo "##### ZTS timeout debug #####" +echo "----- top -----" +echo "$TOP_OUT" +echo "----- /proc/$PID/stack ($CMD)) -----" +cat /proc/$PID/stack +echo "----- strace ($CMD) -----" +TMPFILE="$(mktemp --suffix=ZTS)" +/usr/bin/strace -k --stack-traces -p $PID &> "$TMPFILE" & +sleep 0.1 +killall strace +tail -n 30 $TMPFILE +rm "$TMPFILE" +echo "##### /proc/sysrq-trigger stack #####" +""" + c = "sudo bash -c '" + c + "'" + data = run(c, capture_output=True, shell=True, text=True) + out = data.stdout + try: + kp = Popen([SUDO, "sh", "-c", + "echo '" + out + "' > /dev/kmsg"]) + kp.wait() + + """ + Trigger kernel stack traces + """ + kp = Popen([SUDO, "sh", "-c", + "echo l > /proc/sysrq-trigger"]) + kp.wait() + except Exception: + pass + """ Kill a running command due to timeout, or ^C from the keyboard. If sudo is required, this user was verified previously. @@ -1129,6 +1187,9 @@ def parse_args(): parser.add_option('-o', action='callback', callback=options_cb, default=BASEDIR, dest='outputdir', type='string', metavar='outputdir', help='Specify an output directory.') + parser.add_option('-O', action='store_true', default=False, + dest='timeout_debug', + help='Dump debugging info to /dev/kmsg on test timeout') parser.add_option('-i', action='callback', callback=options_cb, default=TESTDIR, dest='testdir', type='string', metavar='testdir', help='Specify a test directory.') From f0a95e897162ed97a651ebbc630284885f26b01e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Sep 2025 09:35:27 +1000 Subject: [PATCH 12/23] zpool iostat: refresh pool list every interval When running zpool iostat in interval mode, it would not notice any new pools created or imported, and would forget any destroyed or exported, so would not notice if they came back. This leads to outputting "no pools available" every interval until killed. It looks like this was at least intended to work; the comment above zpool_do_iostat() indicates that it is expected to "deal with pool creation/destruction" and that pool_list_update() would detect new pools. That call however was removed in 3e43edd2c5, though its unclear if that broke this behaviour and it wasn't noticed, or if it never worked, or if something later broke it. That said, the lack of pool_list_update() is only part of the reason it doesn't work properly. The fundamental problem is that the various things involved in refreshing or updating the list of pools would aggressively ignore, remove, skip or fail on pools that stop existing, or that already exist. Mostly this meant that once a pool is removed from the list, it will never be seen again. Restoring pool_list_update() to the zpool_do_iostat() loop only partially fixes this - it would find "new" pools again, but only in the "all pools" (no args) mode, and because its iterator callback add_pool() would abort the iterator if it already has a pool listed, it would only add pools if there weren't any already. So, this commit reworks the structure somewhat. pool_list_update() becomes pool_list_refresh(), and will ensure the state of all pools in the list are updated. In the "all pools" mode, it will also add new pools and remove pools that disappear, but when a fixed list of pools is used, the list doesn't change, only the state of the pools within it. The rest of the commit is adjusting things for this much simpler structure. Regardless of the mode in use, pool_list_refresh() will always do the right thing, so the driver code can just get on with the display. Now that pools can appear and disappear, I've made it so the header (if enabled) is re-printed when the list changes, so that its easier to see what's happening if the column widths change. Since this is all rather complicated, I've included tests for the "all pools" and "set of pools" modes. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Igor Kozhukhov Signed-off-by: Rob Norris Closes #17786 --- cmd/zpool/zpool_iter.c | 125 +++++++--- cmd/zpool/zpool_main.c | 54 ++-- cmd/zpool/zpool_util.h | 3 +- tests/runfiles/common.run | 4 + tests/zfs-tests/tests/Makefile.am | 5 + .../cli_root/zpool_iostat/cleanup.ksh | 30 +++ .../cli_root/zpool_iostat/setup.ksh | 32 +++ .../cli_root/zpool_iostat/zpool_iostat.kshlib | 235 ++++++++++++++++++ .../zpool_iostat_interval_all.ksh | 90 +++++++ .../zpool_iostat_interval_some.ksh | 80 ++++++ 10 files changed, 588 insertions(+), 70 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 2eec9a95e24..16d4fe5eb1c 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -26,6 +26,7 @@ /* * Copyright 2016 Igor Kozhukhov . + * Copyright (c) 2025, Klara, Inc. */ #include @@ -52,7 +53,7 @@ typedef struct zpool_node { zpool_handle_t *zn_handle; uu_avl_node_t zn_avlnode; - int zn_mark; + hrtime_t zn_last_refresh; } zpool_node_t; struct zpool_list { @@ -62,6 +63,7 @@ struct zpool_list { uu_avl_pool_t *zl_pool; zprop_list_t **zl_proplist; zfs_type_t zl_type; + hrtime_t zl_last_refresh; }; static int @@ -81,32 +83,47 @@ zpool_compare(const void *larg, const void *rarg, void *unused) * of known pools. */ static int -add_pool(zpool_handle_t *zhp, void *data) +add_pool(zpool_handle_t *zhp, zpool_list_t *zlp) { - zpool_list_t *zlp = data; - zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); + zpool_node_t *node, *new = safe_malloc(sizeof (zpool_node_t)); uu_avl_index_t idx; - node->zn_handle = zhp; - uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); - if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { + new->zn_handle = zhp; + uu_avl_node_init(new, &new->zn_avlnode, zlp->zl_pool); + + node = uu_avl_find(zlp->zl_avl, new, NULL, &idx); + if (node == NULL) { if (zlp->zl_proplist && zpool_expand_proplist(zhp, zlp->zl_proplist, zlp->zl_type, zlp->zl_literal) != 0) { zpool_close(zhp); - free(node); + free(new); return (-1); } - uu_avl_insert(zlp->zl_avl, node, idx); + new->zn_last_refresh = zlp->zl_last_refresh; + uu_avl_insert(zlp->zl_avl, new, idx); } else { + node->zn_last_refresh = zlp->zl_last_refresh; zpool_close(zhp); - free(node); + free(new); return (-1); } return (0); } +/* + * add_pool(), but always returns 0. This allows zpool_iter() to continue + * even if a pool exists in the tree, or we fail to get the properties for + * a new one. + */ +static int +add_pool_cb(zpool_handle_t *zhp, void *data) +{ + (void) add_pool(zhp, data); + return (0); +} + /* * Create a list of pools based on the given arguments. If we're given no * arguments, then iterate over all pools in the system and add them to the AVL @@ -135,9 +152,10 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, zlp->zl_type = type; zlp->zl_literal = literal; + zlp->zl_last_refresh = gethrtime(); if (argc == 0) { - (void) zpool_iter(g_zfs, add_pool, zlp); + (void) zpool_iter(g_zfs, add_pool_cb, zlp); zlp->zl_findall = B_TRUE; } else { int i; @@ -159,15 +177,69 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, } /* - * Search for any new pools, adding them to the list. We only add pools when no - * options were given on the command line. Otherwise, we keep the list fixed as - * those that were explicitly specified. + * Refresh the state of all pools on the list. Additionally, if no options were + * given on the command line, add any new pools and remove any that are no + * longer available. */ -void -pool_list_update(zpool_list_t *zlp) +int +pool_list_refresh(zpool_list_t *zlp) { - if (zlp->zl_findall) - (void) zpool_iter(g_zfs, add_pool, zlp); + zlp->zl_last_refresh = gethrtime(); + + if (!zlp->zl_findall) { + /* + * This list is a fixed list of pools, so we must not add + * or remove any. Just walk over them and refresh their + * state. + */ + int navail = 0; + for (zpool_node_t *node = uu_avl_first(zlp->zl_avl); + node != NULL; node = uu_avl_next(zlp->zl_avl, node)) { + boolean_t missing; + zpool_refresh_stats(node->zn_handle, &missing); + navail += !missing; + node->zn_last_refresh = zlp->zl_last_refresh; + } + return (navail); + } + + /* + * Search for any new pools and add them to the list. zpool_iter() + * will call zpool_refresh_stats() as part of its work, so this has + * the side effect of updating all active handles. + */ + (void) zpool_iter(g_zfs, add_pool_cb, zlp); + + /* + * Walk the list for any that weren't refreshed, and update and remove + * them. It's not enough to just skip available ones, as zpool_iter() + * won't update them, so they'll still appear active in our list. + */ + zpool_node_t *node, *next; + for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next) { + next = uu_avl_next(zlp->zl_avl, node); + + /* + * Skip any that were refreshed and are online; they're already + * handled. + */ + if (node->zn_last_refresh == zlp->zl_last_refresh && + zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL) + continue; + + /* Do the refresh ourselves, just in case. */ + boolean_t missing; + zpool_refresh_stats(node->zn_handle, &missing); + if (missing) { + uu_avl_remove(zlp->zl_avl, node); + zpool_close(node->zn_handle); + free(node); + } else { + node->zn_last_refresh = zlp->zl_last_refresh; + } + } + + return (uu_avl_numnodes(zlp->zl_avl)); } /* @@ -190,23 +262,6 @@ pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func, return (ret); } -/* - * Remove the given pool from the list. When running iostat, we want to remove - * those pools that no longer exist. - */ -void -pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) -{ - zpool_node_t search, *node; - - search.zn_handle = zhp; - if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { - uu_avl_remove(zlp->zl_avl, node); - zpool_close(node->zn_handle); - free(node); - } -} - /* * Free all the handles associated with this list. */ diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 2c46ad0df89..286336cf273 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -33,7 +33,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley - * Copyright (c) 2021, 2023, Klara Inc. + * Copyright (c) 2021, 2023, 2025, Klara, Inc. * Copyright (c) 2021, 2025 Hewlett Packard Enterprise Development LP. */ @@ -5761,24 +5761,6 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, return (ret); } -static int -refresh_iostat(zpool_handle_t *zhp, void *data) -{ - iostat_cbdata_t *cb = data; - boolean_t missing; - - /* - * If the pool has disappeared, remove it from the list and continue. - */ - if (zpool_refresh_stats(zhp, &missing) != 0) - return (-1); - - if (missing) - pool_list_remove(cb->cb_list, zhp); - - return (0); -} - /* * Callback to print out the iostats for the given pool. */ @@ -6359,15 +6341,14 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data) * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely - * on pool_list_update() to detect the addition of new pools. Configuration - * changes are all handled within libzfs. + * on pool_list_refresh() to detect the addition and removal of pools. + * Configuration changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; - int npools; float interval = 0; unsigned long count = 0; zpool_list_t *list; @@ -6618,10 +6599,24 @@ zpool_do_iostat(int argc, char **argv) return (1); } + int last_npools = 0; for (;;) { - if ((npools = pool_list_count(list)) == 0) + /* + * Refresh all pools in list, adding or removing pools as + * necessary. + */ + int npools = pool_list_refresh(list); + if (npools == 0) { (void) fprintf(stderr, gettext("no pools available\n")); - else { + } else { + /* + * If the list of pools has changed since last time + * around, reset the iteration count to force the + * header to be redisplayed. + */ + if (last_npools != npools) + cb.cb_iteration = 0; + /* * If this is the first iteration and -y was supplied * we skip any printing. @@ -6629,15 +6624,6 @@ zpool_do_iostat(int argc, char **argv) boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); - /* - * Refresh all statistics. This is done as an - * explicit step before calculating the maximum name - * width, so that any * configuration changes are - * properly accounted for. - */ - (void) pool_list_iter(list, B_FALSE, refresh_iostat, - &cb); - /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. @@ -6728,6 +6714,8 @@ zpool_do_iostat(int argc, char **argv) (void) fflush(stdout); (void) fsleep(interval); + + last_npools = npools; } pool_list_free(list); diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index 5ab7cb9750f..3af23c52bd4 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -76,11 +76,10 @@ typedef struct zpool_list zpool_list_t; zpool_list_t *pool_list_get(int, char **, zprop_list_t **, zfs_type_t, boolean_t, int *); -void pool_list_update(zpool_list_t *); +int pool_list_refresh(zpool_list_t *); int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); void pool_list_free(zpool_list_t *); int pool_list_count(zpool_list_t *); -void pool_list_remove(zpool_list_t *, zpool_handle_t *); extern libzfs_handle_t *g_zfs; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 36a56ddee58..b0e8f8d4307 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -491,6 +491,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 +[tests/functional/cli_root/zpool_iostat] +tests = ['zpool_iostat_interval_all', 'zpool_iostat_interval_some'] +tags = ['functional', 'cli_root', 'zpool_iostat'] + [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index ec5da0defa4..43eaa45910b 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -197,6 +197,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \ functional/cli_root/zpool_import/zpool_import.cfg \ functional/cli_root/zpool_import/zpool_import.kshlib \ + functional/cli_root/zpool_iostat/zpool_iostat.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ @@ -1181,6 +1182,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \ + functional/cli_root/zpool_iostat/setup.ksh \ + functional/cli_root/zpool_iostat/cleanup.ksh \ + functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh \ + functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh new file mode 100755 index 00000000000..099b5426031 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# +# +. $STF_SUITE/include/libtest.shlib + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh new file mode 100755 index 00000000000..3529a0ccc01 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# +# +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib new file mode 100644 index 00000000000..ea4b0bd2756 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib @@ -0,0 +1,235 @@ +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +# Since we want to make sure that iostat responds correctly as pools appear and +# disappear, we run it in the background and capture its output to a file. +# Once we're done, we parse the output and ensure it matches what we'd expect +# from the operations we performed. +# +# Because iostat is producing output every interval, it may produce the "same" +# output for each step of the change; in fact, we want that to make sure we +# don't miss anything. So, we describe what we expect as a series of "chunks". +# Each chunk is a particular kind of output, which may repeat. Current known +# chunk types are: +# +# NOPOOL: the text "no pools available" +# HEADER: three lines, starting with "capacity", "pool" and "----" respectively. +# (the rough shape of the normal iostat header). +# POOL1: a line starting with "pool1" (stats line for a pool of that name) +# POOL2: a line starting with "pool2" +# POOLBOTH: three lines, starting with "pool1", "pool2" (either order) and +# "-----" respectively. (the pool stat output for multiple pools) +# +# (the parser may produce other chunks in a failed parse to assist with +# debugging, but they should never be part of the "wanted" output See the +# parser commentary below). +# +# To help recognise the start of a new interval output, we run iostat with the +# -T u option, which will output a numeric timestamp before each header or +# second-or-later pool stat after the header. +# +# To keep the test run shorter, we use a subsecond interval, but to make sure +# nothing is missed, we sleep for three intervals after each change. + +typeset _iostat_out=$(mktemp) +typeset _iostat_pid="" + +function cleanup_iostat { + if [[ -n $_iostat_pid ]] ; then + kill -KILL $_iostat_pid || true + fi + rm -f $_iostat_out +} + +function start_iostat { + zpool iostat -T u $@ 0.1 > $_iostat_out 2>&1 & + _iostat_pid=$! +} + +function stop_iostat { + kill -TERM $_iostat_pid + wait $_iostat_pid + _iostat_pid="" +} + +function delay_iostat { + sleep 0.3 +} + +typeset -a _iostat_expect +function expect_iostat { + typeset chunk=$1 + _iostat_expect+=($chunk) +} + +# Parse the output The `state` var is used to track state across +# multiple lines. The `last` var and the `_got_iostat` function are used +# to record the completed chunks, and to collapse repetitions. +typeset -a _iostat_got +typeset _iostat_last="" +typeset _iostat_state="" + +function _got_iostat { + typeset chunk=$1 + if [[ -n $chunk && $_iostat_last != $chunk ]] ; then + _iostat_last=$chunk + _iostat_got+=($chunk) + fi + _iostat_state="" +} + +function verify_iostat { + + cat $_iostat_out | while read line ; do + + # The "no pools available" text has no timestamp or other + # header, and should never appear in the middle of multiline + # chunk, so we can close any in-flight state. + if [[ $line = "no pools available" ]] ; then + _got_iostat $_iostat_state + _got_iostat "NOPOOL" + continue + fi + + # A run of digits alone on the line is a timestamp (the `-T u` + # switch to `iostat`). It closes any in-flight state as a + # complete chunk, and indicates the start of a new chunk. + if [[ -z ${line/#+([0-9])/} ]] ; then + _got_iostat $_iostat_state + _iostat_state="TIMESTAMP" + continue + fi + + # For this test, the first word of each line should be unique, + # so we extract it and use it for simplicity. + typeset first=${line%% *} + + # Header is emitted whenever the pool list changes. It has + # three lines: + # + # capacity operations bandwidth + # pool alloc free read write read write + # ---------- ----- ----- ----- ----- ----- ----- + # + # Each line moves the state; when we get to a run of dashes, we + # commit. Note that we check for one-or-more dashes, because + # the width can vary depending on the length of pool name. + # + if [[ $_iostat_state = "TIMESTAMP" && + $first = "capacity" ]] ; then + _iostat_state="INHEADER1" + continue + fi + if [[ $_iostat_state = "INHEADER1" && + $first = "pool" ]] ; then + _iostat_state="INHEADER2" + continue + fi + if [[ $_iostat_state = "INHEADER2" && + -z ${first/#+(-)/} ]] ; then + # Headers never repeat, so if the last committed chunk + # was a header, we commit this one as EXTRAHEADER so we + # can see it in the error output. + if [[ $_iostat_last = "HEADER" ]] ; then + _got_iostat "EXTRAHEADER" + elif [[ $_iostat_last != "EXTRAHEADER" ]] ; then + _got_iostat "HEADER" + fi + _iostat_state="HEADER" + continue + fi + + # A pool stat line looks like: + # + # pool1 147K 240M 0 0 0 0 + # + # If there are multiple pools, iostat follows them with a + # separator of dashed lines: + # + # pool1 147K 240M 0 0 0 0 + # pool2 147K 240M 0 0 0 0 + # ---------- ----- ----- ----- ----- ----- ----- + # + # Stats rows always start after a timestamp or a header. If the + # header was emitted, we won't see a timestamp here (it goes + # before the header). + # + # Because our test exercises both pools on their own and + # together, we allow pools in either order. In practice they + # are sorted, but that's a side-effect of the implementation + # (see zpool_compare()), so we're not going to rely on it here. + if [[ $first = "pool1" ]] || [[ $first = "pool2" ]] ; then + + # First line, track which one we saw. If it's a + # standalone line, it will be committed by the next + # NOPOOL or TIMESTAMP above (or the `_got_iostat` after + # the loop if this is the last line). + if [[ $_iostat_state == "TIMESTAMP" || + $_iostat_state == "HEADER" ]] ; then + if [[ $first = "pool1" ]] ; then + _iostat_state="POOL1" + elif [[ $first = "pool2" ]] ; then + _iostat_state="POOL2" + fi + continue + fi + + # If this is the second pool, we're in a multi-pool + # block, and need to look for the separator to close it + # out. + if [[ $_iostat_state = "POOL1" && $first = "pool2" ]] || + [[ $_iostat_state = "POOL2" && $first = "pool1" ]] ; + then + _iostat_state="INPOOLBOTH" + continue + fi + fi + + # Separator after the stats block. + if [[ $_iostat_state = "INPOOLBOTH" && + -z ${first/#+(-)/} ]] ; then + _got_iostat "POOLBOTH" + continue + fi + + # Anything else will fall through to here. We commit any + # in-flight state, then "UNKNOWN", all to help with debugging.. + if [[ $_iostat_state != "UNKNOWN" ]] ; then + _got_iostat $_iostat_state + _got_iostat "UNKNOWN" + fi + done + + # Close out any remaining state. + _got_iostat $_iostat_state + + # Compare what we wanted with what we got, and pass/fail the test! + if [[ "${_iostat_expect[*]}" != "${_iostat_got[*]}" ]] ; then + log_note "expected: ${_iostat_expect[*]}" + log_note " got: ${_iostat_got[*]}" + log_fail "zpool iostat did not produce expected output" + fi +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh new file mode 100755 index 00000000000..8e040058ec3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +# `zpool iostat ` should keep running and update the pools it displays as +# pools are created/destroyed/imported/export. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib + +typeset vdev1=$(mktemp) +typeset vdev2=$(mktemp) + +function cleanup { + cleanup_iostat + + poolexists pool1 && destroy_pool pool1 + poolexists pool2 && destroy_pool pool2 + rm -f $vdev1 $vdev2 +} + +log_must mkfile $MINVDEVSIZE $vdev1 $vdev2 + +expect_iostat "NOPOOL" + +start_iostat + +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL1" +log_must zpool create pool1 $vdev1 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOLBOTH" +log_must zpool create pool2 $vdev2 +delay_iostat + +expect_iostat "NOPOOL" +log_must zpool export -a +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL2" +log_must zpool import -d $vdev2 pool2 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOLBOTH" +log_must zpool import -d $vdev1 pool1 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL2" +log_must zpool destroy pool1 +delay_iostat + +expect_iostat "NOPOOL" +log_must zpool destroy pool2 +delay_iostat + +stop_iostat + +verify_iostat + +log_pass "zpool iostat in interval mode follows pool updates" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh new file mode 100755 index 00000000000..ab1f258aa1c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +# `zpool iostat ` should keep running and only show the listed pools. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_iostat/zpool_iostat.kshlib + +typeset vdev1=$(mktemp) +typeset vdev2=$(mktemp) + +function cleanup { + cleanup_iostat + + poolexists pool1 && destroy_pool pool1 + poolexists pool2 && destroy_pool pool2 + rm -f $vdev1 $vdev2 +} + +log_must mkfile $MINVDEVSIZE $vdev1 $vdev2 + +log_must zpool create pool1 $vdev1 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL1" +start_iostat pool1 +delay_iostat + +log_must zpool create pool2 $vdev2 +delay_iostat + +expect_iostat "NOPOOL" +log_must zpool export -a +delay_iostat + +log_must zpool import -d $vdev2 pool2 +delay_iostat + +expect_iostat "HEADER" +expect_iostat "POOL1" +log_must zpool import -d $vdev1 pool1 +delay_iostat + +expect_iostat "NOPOOL" +log_must zpool destroy pool1 +delay_iostat + +log_must zpool destroy pool2 +delay_iostat + +stop_iostat + +verify_iostat + +log_pass "zpool iostat in interval mode with pools follows listed pool updates" From e4a407f29f412e92c1a37b8289fbf1487c18bb18 Mon Sep 17 00:00:00 2001 From: hoshinomori Date: Tue, 30 Sep 2025 07:38:52 +0800 Subject: [PATCH 13/23] range_tree: drop duplicate zfs_ prefix from rs_set_fill_raw Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: hoshinomori Closes #17800 --- include/sys/range_tree.h | 5 ++--- module/zfs/range_tree.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 0f688468245..0f6def36f9f 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -238,8 +238,7 @@ zfs_rs_set_end_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end) } static inline void -zfs_zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, - uint64_t fill) +zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { @@ -277,7 +276,7 @@ static inline void zfs_rs_set_fill(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill) { ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); - zfs_zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); + zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); } typedef void zfs_range_tree_func_t(void *arg, uint64_t start, uint64_t size); diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index ea2d2c7227c..d73195f1a21 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -585,7 +585,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. */ - zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - + zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - zfs_rs_get_start_raw(rs, rt)); zfs_range_tree_stat_incr(rt, &rs_tmp); From 102ff2a640becc1f78acb7945bd6a93bf10cd205 Mon Sep 17 00:00:00 2001 From: nav1s <42621369+nav1s@users.noreply.github.com> Date: Tue, 30 Sep 2025 02:43:22 +0300 Subject: [PATCH 14/23] manuals: fix typos in zpool-upgrade man page Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: nav1s Closes #17797 --- man/man8/zpool-upgrade.8 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man8/zpool-upgrade.8 b/man/man8/zpool-upgrade.8 index cf69060da5c..adae47f82eb 100644 --- a/man/man8/zpool-upgrade.8 +++ b/man/man8/zpool-upgrade.8 @@ -65,10 +65,10 @@ property). .Cm upgrade .Fl v .Xc -Displays legacy ZFS versions supported by the this version of ZFS. +Displays legacy ZFS versions supported by this version of ZFS. See .Xr zpool-features 7 -for a description of feature flags features supported by this version of ZFS. +for a description of features supported by this version of ZFS. .It Xo .Nm zpool .Cm upgrade From 32ce74ff328ea00f84c3bca9fdee6ecbc0c805a0 Mon Sep 17 00:00:00 2001 From: Shreshth3 <66148173+Shreshth3@users.noreply.github.com> Date: Wed, 1 Oct 2025 10:15:46 -0700 Subject: [PATCH 15/23] docs: fix a few small typos (#17804) Signed-off-by: Shreshth Srivastava Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: Tony Hutter --- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- contrib/intel_qat/readme.md | 2 +- etc/init.d/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 9b50a4a3d96..f3d4316f6f6 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -14,7 +14,7 @@ Please check our issue tracker before opening a new feature request. Filling out the following template will help other contributors better understand your proposed feature. --> -### Describe the feature would like to see added to OpenZFS +### Describe the feature you would like to see added to OpenZFS