Handle raidz errors <= nparity rather than ignoring

This PR adds a check in the mirror and raidz code for the case where
there are errors <= nparity. In that case, ZFS sets a new flag on
the zio that will be checked in zio_done. If that flag is set, when
the write IO completes, we issue a read IO for the same blkptr.
That will allow ZFS's auto-healing mechanisms and other error
recovery tools to detect the effectively-corrupt data and handle
it accordingly. Note that because draid uses raidz's IO done function,
it also benefits from this functionality.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #18387
This commit is contained in:
Paul Dagnelie
2026-04-21 14:17:37 -07:00
committed by GitHub
parent f798b40000
commit 6562851406
11 changed files with 188 additions and 9 deletions
+5 -2
View File
@@ -229,6 +229,7 @@ static const struct errstr errstrtable[] = {
{ ECHILD, "dtl" },
{ EILSEQ, "corrupt" },
{ ENOSYS, "noop" },
{ EFAULT, "io-prefail" },
{ 0, NULL },
};
@@ -308,7 +309,8 @@ usage(void)
"\t\tlabel. Label injection can either be 'nvlist', 'uber',\n "
"\t\t'pad1', or 'pad2'.\n"
"\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n"
"\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n"
"\t\t'corrupt' (bit flip), 'io-prefail' (unsuccessfully do\n"
"\t\tnothing) or 'noop' (successfully do nothing).\n"
"\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
"\t\tdevice error injection to a percentage of the IOs.\n"
"\n"
@@ -1026,7 +1028,8 @@ main(int argc, char **argv)
if (error < 0) {
(void) fprintf(stderr, "invalid error type "
"'%s': must be one of: io decompress "
"decrypt nxio dtl corrupt noop\n",
"decrypt nxio dtl corrupt noop "
"io-prefail\n",
optarg);
usage();
libzfs_fini(g_zfs);
+1
View File
@@ -243,6 +243,7 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_REEXECUTED (1ULL << 30)
#define ZIO_FLAG_DELEGATED (1ULL << 31)
#define ZIO_FLAG_PREALLOCATED (1ULL << 32)
#define ZIO_FLAG_POSTREAD (1ULL << 33)
#define ZIO_ALLOCATOR_NONE (-1)
#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
+7
View File
@@ -2388,6 +2388,13 @@ and may need to load new metaslabs to satisfy these allocations.
.It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint
Rewrite new block pointers starting in this pass.
.
.It Sy zfs_scrub_partial_writes Ns = Ns Sy 1 Ns | Ns 0 Pq int
If a write to a multi-disk vdev fails, but the data is recoverable, the data is
persisted on disk but may not be as redundant as the vdev usually ensures.
If this tunable is set, we issue a read after such a write error to detect the
full extent of the problem and attempt to recover from it.
Note: This currently only works with RAID-Z and dRAID.
.
.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
Maximum size of TRIM command.
Larger ranges will be split into chunks no larger than this value before
+3 -1
View File
@@ -237,8 +237,10 @@ for an ECHILD error,
for an EIO error where reopening the device will succeed,
.It Sy nxio
for an ENXIO error where reopening the device will fail, or
.It Sy io-prefail
to drop the IO without executing it and return failure, or
.It Sy noop
to drop the IO without executing it, and return success.
to drop the IO without executing it and return success.
.El
.Pp
For EIO and ENXIO, the "failed" reads or writes still occur.
+21
View File
@@ -406,6 +406,16 @@ static unsigned long raidz_io_aggregate_rows = 4;
*/
static int zfs_scrub_after_expand = 1;
/*
 * If a write encounters errors, but few enough that the data is still
 * recoverable, the block ends up on disk with less redundancy than the
 * vdev normally provides. Historically ZFS silently moved on in that case.
 * When this tunable is nonzero, a read is issued after such a write,
 * allowing the normal error recovery process to handle it.
 *
 * NOTE: Currently applies only to raidz and draid.
 */
static int zfs_scrub_partial_writes = 1;
static void
vdev_raidz_row_free(raidz_row_t *rr)
{
@@ -3641,6 +3651,7 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
{
int normal_errors = 0;
int shadow_errors = 0;
int retryable_errors = 0;
ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
@@ -3657,6 +3668,11 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
ASSERT(rc->rc_shadow_error != ECKSUM);
shadow_errors++;
}
if (rc->rc_error || rc->rc_shadow_error) {
vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
if (!(vdev_is_dead(cvd) || cvd->vdev_cant_write))
retryable_errors++;
}
}
/*
@@ -3676,6 +3692,8 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
shadow_errors > rr->rr_firstdatacol) {
zio->io_error = zio_worst_error(zio->io_error,
vdev_raidz_worst_error(rr));
} else if (retryable_errors && zfs_scrub_partial_writes) {
zio->io_flags |= ZIO_FLAG_POSTREAD;
}
}
@@ -5528,6 +5546,9 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
"For expanded RAIDZ, automatically start a pool scrub when expansion "
"completes");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_partial_writes, INT, ZMOD_RW,
"Issue reads after writes with recoverable failures to ensure "
"integrity");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
"Raidz/draid slow disk sit out time period in seconds");
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64,
+34 -4
View File
@@ -1676,10 +1676,10 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
* have already processed the original allocating I/O.
*/
if (flags & ZIO_FLAG_ALLOC_THROTTLED &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY)) &&
type == ZIO_TYPE_WRITE) {
ASSERT(pio->io_metaslab_class != NULL);
ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
ASSERT(type == ZIO_TYPE_WRITE);
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
@@ -4779,11 +4779,17 @@ zio_vdev_io_start(zio_t *zio)
}
zio->io_delay = gethrtime();
if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) {
int error = zio_handle_device_injections(vd, zio, ENOSYS,
EFAULT);
if (error == ENOSYS || (error == EFAULT &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR))) {
/*
* "no-op" injections return success, but do no actual
* work. Just return it.
* work. Just return it. "io-prefail" injections are
* similar, but don't return success.
*/
if (error == EFAULT)
zio->io_error = EIO;
zio_delay_interrupt(zio);
return (NULL);
}
@@ -5513,6 +5519,12 @@ zio_dva_throttle_done(zio_t *zio)
}
}
/*
 * Done callback for the read that zio_done() issues when a write has
 * ZIO_FLAG_POSTREAD set. The data that was read is not consumed here;
 * this only frees the buffer that was allocated to receive it.
 */
static void
zio_done_postread_done(zio_t *zio)
{
abd_free(zio->io_abd);
}
static zio_t *
zio_done(zio_t *zio)
{
@@ -5843,6 +5855,24 @@ zio_done(zio_t *zio)
zfs_ereport_free_checksum(zcr);
}
if (zio->io_flags & ZIO_FLAG_POSTREAD) {
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
zl = NULL;
zio_t *pio = zio_walk_parents(zio, &zl);
blkptr_t *bp = zio->io_bp;
abd_t *abd = abd_alloc_for_io(BP_GET_PSIZE(bp), B_FALSE);
zio_priority_t prio = zio->io_priority ==
ZIO_PRIORITY_SYNC_WRITE ? ZIO_PRIORITY_SYNC_READ :
ZIO_PRIORITY_SCRUB;
zio_t *cio = zio_vdev_child_io(pio, zio->io_bp, zio->io_vd,
zio->io_offset, abd, zio->io_size, ZIO_TYPE_READ, prio,
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
ZIO_FLAG_RESILVER | ZIO_FLAG_DONT_PROPAGATE,
zio_done_postread_done, NULL);
cio->io_flags &= ~ZIO_FLAG_ALLOC_THROTTLED;
zio_nowait(cio);
}
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as
+1 -1
View File
@@ -912,7 +912,7 @@ tags = ['functional', 'redacted_send']
tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos',
'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos',
'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg',
'raidz_expand_007_neg']
'raidz_expand_007_neg', 'raidz_zinject']
tags = ['functional', 'raidz']
timeout = 1200
+20
View File
@@ -2019,6 +2019,26 @@ function wait_sit_out #pool vdev timeout
return 1
}
#
# Check the output of 'zpool status -v <pool>' and see whether the line(s)
# mentioning <device> match the <regex> specified (case-insensitively).
# If <verbose> is "true", the matching line(s) are logged as well.
#
# Return 0 if they match, non-zero otherwise
#
function check_pool_device # pool device regex <verbose>
{
	typeset pool=$1
	typeset device=$2
	typeset regex=$3
	typeset verbose=${4:-false}
	# Keep the captured output local so it does not leak into the caller.
	typeset scan

	# Quote the pattern (and end option parsing) so that a device name
	# containing whitespace, glob characters or a leading '-' is handed
	# to grep as a single pattern argument.
	scan=$(zpool status -v "$pool" 2>/dev/null | grep -- "$device")
	if [[ $verbose == true ]]; then
		log_note $scan
	fi
	# $scan is intentionally unquoted here: runs of whitespace are
	# collapsed to single spaces before <regex> is applied.
	echo $scan | grep -qi "$regex"
}
#
# Check the output of 'zpool status -v <pool>',
# and to see if the content of <token> contain the <keyword> specified.
+1
View File
@@ -1884,6 +1884,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/raidz/raidz_expand_005_pos.ksh \
functional/raidz/raidz_expand_006_neg.ksh \
functional/raidz/raidz_expand_007_neg.ksh \
functional/raidz/raidz_zinject.ksh \
functional/raidz/setup.ksh \
functional/redacted_send/cleanup.ksh \
functional/redacted_send/redacted_compressed.ksh \
@@ -48,7 +48,7 @@ function cleanup
function test_device_fault
{
typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop")
typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop" "io-prefail")
for e in ${errno[@]}; do
log_must eval \
"zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL"
+94
View File
@@ -0,0 +1,94 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2026, Klara Inc.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
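# Inject io-prefail errors on writes to one child of a raidz vdev, then
# write some data and verify that the pool reports a problem but no data
# errors. Then remove a different child, scrub, and verify that there are
# still no data errors and that the injected child's status line is clean.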
#
function cleanup
{
	typeset backing

	# Record the final pool state in the log before tearing things down.
	log_pos zpool status $TESTPOOL
	# Cancel every zinject handler that may still be registered.
	log_must zinject -c all
	if poolexists "$TESTPOOL"; then
		log_must_busy zpool destroy "$TESTPOOL"
	fi
	# Delete the file-backed devices created for the pool.
	for i in {1..$devs}; do
		backing="$TEST_BASE_DIR/dev-$i"
		log_must rm -f "$backing"
	done
}
# Hand the cleanup function to log_onexit before any state is created.
log_onexit cleanup
# Number of file-backed devices, and the size given to truncate for each
# one (as "<dev_size_mb>M").
typeset -r devs=6
typeset -r dev_size_mb=128
typeset -a disks
# Disk files which will be used by pool
for i in {1..$devs}; do
device=$TEST_BASE_DIR/dev-$i
log_must truncate -s ${dev_size_mb}M $device
# Append to the array; the first entry is stored at index 1.
disks[${#disks[*]}+1]=$device
done
#
# One attempt at the scenario. Returns 1 as soon as one of the log_pos
# status checks fails, which lets the caller clean up and try again.
#
function run_test
{
# Fresh raidz1 pool across all of the file-backed devices.
log_must zpool create -f -o cachefile=none -O recordsize=16k $TESTPOOL raidz1 ${disks[@]}
# Inject io-prefail on writes to dev-1, limited by the frequency value 25.
log_must zinject -d $TEST_BASE_DIR/dev-1 -e io-prefail -T write -f 25 $TESTPOOL
# Write data while the injection is active, then sync the pool.
log_must file_write -o create -f /$TESTPOOL/file -b 128k -c 1000 -d R
log_must zpool sync $TESTPOOL
# The pool must report no data errors, yet its "status" content must
# contain "One or more".
log_pos check_pool_status $TESTPOOL "errors" "No known data errors" || return 1
log_pos check_pool_status $TESTPOOL "status" "One or more" || return 1
# Stop injecting, then re-import the pool with dev-2's backing file
# removed. NOTE(review): presumably so the scrub must rely on what was
# actually written to dev-1 - confirm.
log_must zinject -c all
log_must zpool export -f $TESTPOOL
log_must rm $TEST_BASE_DIR/dev-2
log_must zpool import -d $TEST_BASE_DIR $TESTPOOL
log_must zpool scrub $TESTPOOL
log_must zpool wait -t scrub $TESTPOOL
# After the scrub there must still be no data errors, and the status
# line for dev-1 must show ONLINE and end in " 0".
log_pos check_pool_status $TESTPOOL "errors" "No known data" || return 1
log_pos check_pool_device $TESTPOOL "dev-1" "ONLINE.* 0$" || return 1
}
# Make up to three attempts. A successful attempt reports log_pass; after
# a failed one, clear the injections, destroy the pool and re-create dev-2
# (run_test may have removed it) before trying again.
i=0
while [[ $i -lt 3 ]]; do
run_test && log_pass "raidz handles partial write failure."
log_must zinject -c all
log_must zpool destroy $TESTPOOL
log_must truncate -s ${dev_size_mb}M $TEST_BASE_DIR/dev-2
i=$((i + 1))
done
# Reached only when no attempt passed.
# NOTE(review): assumes log_pass ends the script - confirm.
log_fail "raidz does not handle partial write failure."