From 181e1b52276ae29997902faf886d8298a77b39f8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 11 May 2026 16:26:09 -0400 Subject: [PATCH] Fix double free for blocks cloned after DDT prune Before this change, for blocks marked with D flag but absent in DDT (pruned from it), zio_ddt_free() fell back to ZIO_STAGE_DVA_FREE without trying ZIO_STAGE_BRT_FREE first. Same time such blocks might be present in BRT, and not handling that would result in double/multiple free. This change makes ZIO_DDT_FREE_PIPELINE include ZIO_FREE_PIPELINE, just adding required ZIO_STAGE_ISSUE_ASYNC and ZIO_STAGE_DDT_FREE, and moves DDT stages before BRT. This way, if the block is found in DDT by zio_ddt_free(), the pipeline is short-circuited to ZIO_INTERLOCK_PIPELINE, similar to what zio_brt_free() does. If not, then BRT is checked, and if also no match, the block is freed. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Closes #18520 --- include/sys/zio_impl.h | 13 +- man/man8/zpool-events.8 | 10 +- module/zcommon/zfs_valstr.c | 2 +- module/zfs/zio.c | 23 ++- tests/runfiles/common.run | 8 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/dedup/dedup_bclone_pruned.ksh | 152 ++++++++++++++++++ 7 files changed, 184 insertions(+), 25 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 42147adaf1a..62e7e27da38 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -139,12 +139,12 @@ enum zio_stage { ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W---- */ - ZIO_STAGE_BRT_FREE = 1 << 9, /* --F--- */ + ZIO_STAGE_DDT_READ_START = 1 << 9, /* R----- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R----- */ + ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W---- */ + ZIO_STAGE_DDT_FREE = 1 << 12, /* --F--- */ - ZIO_STAGE_DDT_READ_START = 1 << 10, /* R----- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R----- */ - ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W---- */ - ZIO_STAGE_DDT_FREE = 1 << 13, /* --F--- */ + ZIO_STAGE_BRT_FREE = 1 << 13, /* --F--- */ ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC-- */ ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC-- */ @@ -259,8 +259,7 @@ enum zio_stage { ZIO_STAGE_DVA_FREE) #define ZIO_DDT_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_FREE_BP_INIT | \ + (ZIO_FREE_PIPELINE | \ ZIO_STAGE_ISSUE_ASYNC | \ ZIO_STAGE_DDT_FREE) diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 3753139bdfe..12a11058072 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -458,12 +458,12 @@ ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W---- ZIO_STAGE_NOP_WRITE:0x00000100:-W---- -ZIO_STAGE_BRT_FREE:0x00000200:--F--- +ZIO_STAGE_DDT_READ_START:0x00000200:R----- +ZIO_STAGE_DDT_READ_DONE:0x00000400:R----- +ZIO_STAGE_DDT_WRITE:0x00000800:-W---- +ZIO_STAGE_DDT_FREE:0x00001000:--F--- -ZIO_STAGE_DDT_READ_START:0x00000400:R----- -ZIO_STAGE_DDT_READ_DONE:0x00000800:R----- -ZIO_STAGE_DDT_WRITE:0x00001000:-W---- -ZIO_STAGE_DDT_FREE:0x00002000:--F--- +ZIO_STAGE_BRT_FREE:0x00002000:--F--- ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC-- ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC-- diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c index 0cb9f584acc..41a2313e575 100644 --- a/module/zcommon/zfs_valstr.c +++ b/module/zcommon/zfs_valstr.c @@ -238,11 +238,11 @@ _VALSTR_BITFIELD_IMPL(zio_stage, { 'E', "EN", "ENCRYPT" }, { 'C', "CG", "CHECKSUM_GENERATE" }, { 'N', "NW", "NOP_WRITE" }, - { 'B', "BF", "BRT_FREE" }, { 'd', "dS", "DDT_READ_START" }, { 'd', "dD", "DDT_READ_DONE" }, { 'd', "dW", "DDT_WRITE" }, { 'd', "dF", "DDT_FREE" }, + { 'B', "BF", "BRT_FREE" }, { 'G', "GA", "GANG_ASSEMBLE" }, { 'G', "GI", "GANG_ISSUE" }, { 'D', "DT", "DVA_THROTTLE" }, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 94b44561bd9..3e95103385c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4168,14 +4168,21 @@ zio_ddt_free(zio_t *zio) } ddt_exit(ddt); - /* - * When no entry was found, it must have been pruned, - * so we can free it now instead of decrementing the - * refcount in the DDT. - */ - if (!dde) { + if (dde) { + /* + * DDT entry found and the refcount has been decremented. + * Stop the pipeline — there is nothing more to do right now. + */ + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } else { + /* + * No DDT entry; the block must have been pruned from the + * table. Clear the DEDUP bit so it is treated as a normal + * block from here on. BRT_FREE and DVA_FREE follow in the + * pipeline and will handle any cloned references and the + * actual block free respectively. + */ BP_SET_DEDUP(bp, 0); - zio->io_pipeline |= ZIO_STAGE_DVA_FREE; } return (zio); @@ -5925,11 +5932,11 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_encrypt, zio_checksum_generate, zio_nop_write, - zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, + zio_brt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f18835da74b..fbce8c8db65 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -717,10 +717,10 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing', - 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', - 'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_prune_leak', - 'dedup_zap_shrink'] +tests = ['dedup_bclone_pruned', 'dedup_fdt_create', 'dedup_fdt_import', + 'dedup_fdt_pacing', 'dedup_legacy_create', 'dedup_legacy_import', + 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota', + 'dedup_prune', 'dedup_prune_leak', 'dedup_zap_shrink'] pre = post = tags = ['functional', 'dedup'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 5dd350ece7c..7c8dbfe5fcd 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1503,6 +1503,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/deadman/deadman_zio.ksh \ functional/dedup/cleanup.ksh \ functional/dedup/setup.ksh \ + functional/dedup/dedup_bclone_pruned.ksh \ functional/dedup/dedup_fdt_create.ksh \ functional/dedup/dedup_fdt_import.ksh \ functional/dedup/dedup_fdt_pacing.ksh \ diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh new file mode 100755 index 00000000000..d01d09ac12e --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh @@ -0,0 +1,152 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, TrueNAS. +# + +# +# DESCRIPTION: +# Verify that block cloning works correctly when the DDT entry for a +# dedup block has been pruned. When a block has the DEDUP bit set but +# no DDT entry (because it was pruned), cloning it must create a BRT +# entry to track the extra reference. Freeing the original must +# consult the BRT rather than proceeding directly to a DVA free, +# otherwise the block is freed while the clone still references it. +# +# STRATEGY: +# 1. Create a pool with both dedup and block_cloning enabled +# 2. Write a file with dedup=on so blocks get DEDUP bit set in their BPs +# 3. Prune the DDT to remove those entries (blocks remain, DEDUP bit +# stays set in block pointers) +# 4. Clone the file - brt_pending_apply_vdev() must fall back to BRT +# since ddt_addref() returns B_FALSE for pruned entries +# 5. Write a second copy via dd - same hash, new physical blocks, new +# DDT entries at different DVAs from the BRT-tracked blocks +# 6. Delete the clone first - must go through BRT, not DDT, even though +# a matching DDT entry now exists for the same hash +# 7. Delete the dd copy - DDT entries freed normally +# 8. Delete the original - no DDT entry, no BRT entry, DVA freed +# 9. Verify reference counts with zdb -b at each step +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_assert "Block cloning of dedup blocks with pruned DDT entries uses BRT" + +# Flush DDT log every TXG so entries appear in the ZAP immediately, +# making ddtprune effective and test behavior predictable. +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000 + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + log_must restore_tunable DEDUP_LOG_TXG_MAX + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +} + +log_onexit cleanup + +log_must zpool create -f -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must zfs create -o dedup=sha256 -o recordsize=128k $TESTPOOL/$TESTFS +typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +# Write unique data: each block gets a DDT entry with refcnt=1. +log_must dd if=/dev/urandom of=$mountpoint/file1 bs=128k count=8 + +sync_pool $TESTPOOL + +# Verify DDT has entries before pruning. +typeset entries=$(zpool status -D $TESTPOOL | \ + grep "dedup: DDT entries" | awk '{print $4}') +log_must test "$entries" -eq 8 + +# Sleep 1s so the DDT entries are at least 1 second old. ddtprune uses +# an age-based cutoff and will silently skip entries that are too fresh. +sleep 1 + +# Prune all unique (refcnt=1) entries. The blocks remain on disk and the +# block pointers in file1 still have the DEDUP bit set, but there is no +# longer a DDT entry for them. +log_must zpool ddtprune -p 100 $TESTPOOL +sync_pool $TESTPOOL + +# Confirm the prune actually removed all entries. +entries=$(zpool status -D $TESTPOOL | \ + grep "dedup: DDT entries" | awk '{print $4}') +[[ -z "$entries" || "$entries" -eq 0 ]] || \ + log_fail "DDT entries not pruned: $entries remain" + +# Clone file1. brt_pending_apply_vdev() will see the DEDUP bit, call +# ddt_addref(), receive B_FALSE (no DDT entry), and fall through to +# create BRT entries instead. +log_must clonefile -f $mountpoint/file1 $mountpoint/clone1 +sync_pool $TESTPOOL + +# BRT entries exist; reference counts must be consistent. +log_must zdb -b $TESTPOOL + +# Write a second copy via dd. Since the DDT was pruned, dedup can't find +# an existing entry and writes new physical blocks at new DVAs, creating +# fresh DDT entries with refcnt=1. The BRT-tracked blocks (file1/clone1) +# are at the old DVAs and are unaffected. +log_must dd if=$mountpoint/file1 of=$mountpoint/file2 bs=128k +sync_pool $TESTPOOL + +# Eight new unique DDT entries (file2's blocks); BRT still holds refs for +# file1/clone1's old blocks. +typeset entries=$(zpool status -D $TESTPOOL | \ + grep "dedup: DDT entries" | awk '{print $4}') +log_must test "$entries" -eq 8 +log_must zdb -b $TESTPOOL + +# Delete the clone first. Its blocks carry the DEDUP bit and the same +# hash as file2's DDT entries, but the DVAs differ — the free must go +# through BRT, not DDT, leaving file2's DDT entries intact. +log_must rm $mountpoint/clone1 +sync_pool $TESTPOOL + +entries=$(zpool status -D $TESTPOOL | \ + grep "dedup: DDT entries" | awk '{print $4}') +log_must test "$entries" -eq 8 +log_must zdb -b $TESTPOOL + +# Delete file2. DDT entries freed; file1's BRT-tracked blocks unaffected. +log_must rm $mountpoint/file2 +sync_pool $TESTPOOL +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" +log_must zdb -b $TESTPOOL + +# Delete the original. No DDT entry, no BRT entry; DVA freed directly. +log_must rm $mountpoint/file1 +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + +log_pass "Block cloning of dedup blocks with pruned DDT entries uses BRT"