diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index 37ff92a816f..1b5fd595830 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -229,6 +229,7 @@ static const struct errstr errstrtable[] = { { ECHILD, "dtl" }, { EILSEQ, "corrupt" }, { ENOSYS, "noop" }, + { EFAULT, "io-prefail" }, { 0, NULL }, }; @@ -308,7 +309,8 @@ usage(void) "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n " "\t\t'pad1', or 'pad2'.\n" "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n" - "\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n" + "\t\t'corrupt' (bit flip), 'io-prefail' (unsuccessfully do\n" + "\t\tnothing) or 'noop' (successfully do nothing).\n" "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n" "\t\tdevice error injection to a percentage of the IOs.\n" "\n" @@ -1026,7 +1028,8 @@ main(int argc, char **argv) if (error < 0) { (void) fprintf(stderr, "invalid error type " "'%s': must be one of: io decompress " - "decrypt nxio dtl corrupt noop\n", + "decrypt nxio dtl corrupt noop " + "io-prefail\n", optarg); usage(); libzfs_fini(g_zfs); diff --git a/include/sys/zio.h b/include/sys/zio.h index acb0a03a36b..c3a199ce813 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -243,6 +243,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_REEXECUTED (1ULL << 30) #define ZIO_FLAG_DELEGATED (1ULL << 31) #define ZIO_FLAG_PREALLOCATED (1ULL << 32) +#define ZIO_FLAG_POSTREAD (1ULL << 33) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 82b0a890e0b..3c5e08fdf41 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2388,6 +2388,13 @@ and may need to load new metaslabs to satisfy these allocations. .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint Rewrite new block pointers starting in this pass. . 
+.It Sy zfs_scrub_partial_writes Ns = Ns Sy 1 Ns | Ns 0 Pq int +If a write to a multi-disk vdev fails, but the data is recoverable, the data is +persisted on disk but may not be as redundant as the vdev usually ensures. +If this tunable is set, we issue a read after such a write error to detect the +full extent of the problem and attempt to recover from it. +Note: This currently only works with RAID-Z and dRAID. +. .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index 092af93211c..cda6d337864 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -237,8 +237,10 @@ for an ECHILD error, for an EIO error where reopening the device will succeed, .It Sy nxio for an ENXIO error where reopening the device will fail, or +.It Sy io-prefail +to drop the IO without executing it and return failure, or .It Sy noop -to drop the IO without executing it, and return success. +to drop the IO without executing it and return success. .El .Pp For EIO and ENXIO, the "failed" reads or writes still occur. diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index aa44acbf39c..2db7422e772 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -406,6 +406,16 @@ static unsigned long raidz_io_aggregate_rows = 4; */ static int zfs_scrub_after_expand = 1; +/* + * If there are errors when writing, but few enough that the data is + * recoverable, ZFS would otherwise silently move on, leaving the data less + * redundant than intended. If this tunable is set, we issue a read after such + * a write, allowing the normal error recovery process to repair it. + * + * NOTE: Currently applies only to raidz and draid. 
+ */ +static int zfs_scrub_partial_writes = 1; + static void vdev_raidz_row_free(raidz_row_t *rr) { @@ -3641,6 +3651,7 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { int normal_errors = 0; int shadow_errors = 0; + int retryable_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); @@ -3657,6 +3668,11 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) ASSERT(rc->rc_shadow_error != ECKSUM); shadow_errors++; } + if (rc->rc_error || rc->rc_shadow_error) { + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; + if (!(vdev_is_dead(cvd) || cvd->vdev_cant_write)) + retryable_errors++; + } } /* @@ -3676,6 +3692,8 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); + } else if (retryable_errors && zfs_scrub_partial_writes) { + zio->io_flags |= ZIO_FLAG_POSTREAD; } } @@ -5528,6 +5546,9 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); +ZFS_MODULE_PARAM(zfs, zfs_, scrub_partial_writes, INT, ZMOD_RW, + "Issue reads after writes with recoverable failures to ensure " + "integrity"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW, "Raidz/draid slow disk sit out time period in seconds"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 5c2c984c34b..94b44561bd9 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1676,10 +1676,10 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * have already processed the original allocating I/O. 
*/ if (flags & ZIO_FLAG_ALLOC_THROTTLED && - (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { + (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY)) && + type == ZIO_TYPE_WRITE) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); - ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || @@ -4779,11 +4779,17 @@ zio_vdev_io_start(zio_t *zio) } zio->io_delay = gethrtime(); - if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { + int error = zio_handle_device_injections(vd, zio, ENOSYS, + EFAULT); + if (error == ENOSYS || (error == EFAULT && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR))) { /* * "no-op" injections return success, but do no actual - * work. Just return it. + * work. Just return it. "io-prefail" injections are + * similar, but don't return success. */ + if (error == EFAULT) + zio->io_error = EIO; zio_delay_interrupt(zio); return (NULL); } @@ -5513,6 +5519,12 @@ zio_dva_throttle_done(zio_t *zio) } } +static void +zio_done_postread_done(zio_t *zio) +{ + abd_free(zio->io_abd); +} + static zio_t * zio_done(zio_t *zio) { @@ -5843,6 +5855,24 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } + if (zio->io_flags & ZIO_FLAG_POSTREAD) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + zl = NULL; + zio_t *pio = zio_walk_parents(zio, &zl); + blkptr_t *bp = zio->io_bp; + abd_t *abd = abd_alloc_for_io(BP_GET_PSIZE(bp), B_FALSE); + zio_priority_t prio = zio->io_priority == + ZIO_PRIORITY_SYNC_WRITE ? 
ZIO_PRIORITY_SYNC_READ : + ZIO_PRIORITY_SCRUB; + zio_t *cio = zio_vdev_child_io(pio, zio->io_bp, zio->io_vd, + zio->io_offset, abd, zio->io_size, ZIO_TYPE_READ, prio, + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER | ZIO_FLAG_DONT_PROPAGATE, + zio_done_postread_done, NULL); + cio->io_flags &= ~ZIO_FLAG_ALLOC_THROTTLED; + zio_nowait(cio); + } + /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 243d28e8bc4..c50008e0b2b 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -912,7 +912,7 @@ tags = ['functional', 'redacted_send'] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos', 'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos', 'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg', - 'raidz_expand_007_neg'] + 'raidz_expand_007_neg', 'raidz_zinject'] tags = ['functional', 'raidz'] timeout = 1200 diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 974e19c0426..39e63bed7bf 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -2019,6 +2019,26 @@ function wait_sit_out #pool vdev timeout return 1 } +# +# Check the output of 'zpool status -v <pool>', +# and see if the status line of <device> matches the <regex> specified. +# +# Return 0 if it matches, 1 otherwise +# +function check_pool_device # pool device regex <verbose> +{ + typeset pool=$1 + typeset device=$2 + typeset regex=$3 + typeset verbose=${4:-false} + + typeset scan=$(zpool status -v "$pool" 2>/dev/null | grep "$device") + if [[ $verbose == true ]]; then + log_note $scan + fi + echo $scan | grep -qi "$regex" +} + # # Check the output of 'zpool status -v <pool>', # and to see if the content of <token> contain the <keyword> specified. 
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index cf04950a961..c3aad114af4 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1884,6 +1884,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/raidz/raidz_expand_005_pos.ksh \ functional/raidz/raidz_expand_006_neg.ksh \ functional/raidz/raidz_expand_007_neg.ksh \ + functional/raidz/raidz_zinject.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh index 19351dc8f2d..93c320da6fd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh @@ -48,7 +48,7 @@ function cleanup function test_device_fault { - typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop") + typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop" "io-prefail") for e in ${errno[@]}; do log_must eval \ "zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL" diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_zinject.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_zinject.ksh new file mode 100755 index 00000000000..e0417afc775 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_zinject.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Inject io-prefail write errors on one raidz child, write data, then verify +# the pool reports them and scrubs clean even with another child removed. +# + +function cleanup +{ + log_pos zpool status $TESTPOOL + + log_must zinject -c all + + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {1..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + +} + +log_onexit cleanup + +typeset -r devs=6 +typeset -r dev_size_mb=128 + +typeset -a disks + +# Disk files which will be used by pool +for i in {1..$devs}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +function run_test +{ + log_must zpool create -f -o cachefile=none -O recordsize=16k $TESTPOOL raidz1 ${disks[@]} + + log_must zinject -d $TEST_BASE_DIR/dev-1 -e io-prefail -T write -f 25 $TESTPOOL + + log_must file_write -o create -f /$TESTPOOL/file -b 128k -c 1000 -d R + log_must zpool sync $TESTPOOL + log_pos check_pool_status $TESTPOOL "errors" "No known data errors" || return 1 + log_pos check_pool_status $TESTPOOL "status" "One or more" || return 1 + + log_must zinject -c all + log_must zpool export -f $TESTPOOL + log_must rm $TEST_BASE_DIR/dev-2 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + log_must zpool scrub $TESTPOOL + log_must zpool wait -t scrub $TESTPOOL + log_pos check_pool_status $TESTPOOL "errors" "No known data" || return 1 + log_pos check_pool_device $TESTPOOL "dev-1" "ONLINE.* 0$" || return 1 +} + +i=0 +while [[ $i -lt 3 ]]; do + run_test && 
log_pass "raidz handles partial write failure." + log_must zinject -c all + log_must zpool destroy $TESTPOOL + log_must truncate -s ${dev_size_mb}M $TEST_BASE_DIR/dev-2 + i=$((i + 1)) +done + +log_fail "raidz does not handle partial write failure."