Fix double free for blocks cloned after DDT prune

Before this change, for blocks marked with D flag but absent in DDT
(pruned from it), zio_ddt_free() fell back to ZIO_STAGE_DVA_FREE
without trying ZIO_STAGE_BRT_FREE first.  Same time such blocks
might be present in BRT, and not handling that would result in
double/multiple free.

This change makes ZIO_DDT_FREE_PIPELINE include ZIO_FREE_PIPELINE,
just adding required ZIO_STAGE_ISSUE_ASYNC and ZIO_STAGE_DDT_FREE,
and moves DDT stages before BRT.  This way, if the block is found
in DDT by zio_ddt_free(), the pipeline is short-circuited to
ZIO_INTERLOCK_PIPELINE, similar to what zio_brt_free() does.  If
not, then BRT is checked, and if also no match, the block is freed.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@truenas.com>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18520
This commit is contained in:
Alexander Motin
2026-05-11 16:26:09 -04:00
committed by Brian Behlendorf
parent 58c8dc5f69
commit 181e1b5227
7 changed files with 184 additions and 25 deletions
+6 -7
View File
@@ -139,12 +139,12 @@ enum zio_stage {
ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W---- */
ZIO_STAGE_BRT_FREE = 1 << 9, /* --F--- */
ZIO_STAGE_DDT_READ_START = 1 << 9, /* R----- */
ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R----- */
ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W---- */
ZIO_STAGE_DDT_FREE = 1 << 12, /* --F--- */
ZIO_STAGE_DDT_READ_START = 1 << 10, /* R----- */
ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R----- */
ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W---- */
ZIO_STAGE_DDT_FREE = 1 << 13, /* --F--- */
ZIO_STAGE_BRT_FREE = 1 << 13, /* --F--- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC-- */
ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC-- */
@@ -259,8 +259,7 @@ enum zio_stage {
ZIO_STAGE_DVA_FREE)
#define ZIO_DDT_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \
(ZIO_FREE_PIPELINE | \
ZIO_STAGE_ISSUE_ASYNC | \
ZIO_STAGE_DDT_FREE)
+5 -5
View File
@@ -458,12 +458,12 @@ ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W----
ZIO_STAGE_NOP_WRITE:0x00000100:-W----
ZIO_STAGE_BRT_FREE:0x00000200:--F---
ZIO_STAGE_DDT_READ_START:0x00000200:R-----
ZIO_STAGE_DDT_READ_DONE:0x00000400:R-----
ZIO_STAGE_DDT_WRITE:0x00000800:-W----
ZIO_STAGE_DDT_FREE:0x00001000:--F---
ZIO_STAGE_DDT_READ_START:0x00000400:R-----
ZIO_STAGE_DDT_READ_DONE:0x00000800:R-----
ZIO_STAGE_DDT_WRITE:0x00001000:-W----
ZIO_STAGE_DDT_FREE:0x00002000:--F---
ZIO_STAGE_BRT_FREE:0x00002000:--F---
ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC--
ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC--
+1 -1
View File
@@ -238,11 +238,11 @@ _VALSTR_BITFIELD_IMPL(zio_stage,
{ 'E', "EN", "ENCRYPT" },
{ 'C', "CG", "CHECKSUM_GENERATE" },
{ 'N', "NW", "NOP_WRITE" },
{ 'B', "BF", "BRT_FREE" },
{ 'd', "dS", "DDT_READ_START" },
{ 'd', "dD", "DDT_READ_DONE" },
{ 'd', "dW", "DDT_WRITE" },
{ 'd', "dF", "DDT_FREE" },
{ 'B', "BF", "BRT_FREE" },
{ 'G', "GA", "GANG_ASSEMBLE" },
{ 'G', "GI", "GANG_ISSUE" },
{ 'D', "DT", "DVA_THROTTLE" },
+13 -6
View File
@@ -4168,14 +4168,21 @@ zio_ddt_free(zio_t *zio)
}
ddt_exit(ddt);
if (dde) {
/*
* When no entry was found, it must have been pruned,
* so we can free it now instead of decrementing the
* refcount in the DDT.
* DDT entry found and the refcount has been decremented.
* Stop the pipeline — there is nothing more to do right now.
*/
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
} else {
/*
* No DDT entry; the block must have been pruned from the
* table. Clear the DEDUP bit so it is treated as a normal
* block from here on. BRT_FREE and DVA_FREE follow in the
* pipeline and will handle any cloned references and the
* actual block free respectively.
*/
if (!dde) {
BP_SET_DEDUP(bp, 0);
zio->io_pipeline |= ZIO_STAGE_DVA_FREE;
}
return (zio);
@@ -5925,11 +5932,11 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_encrypt,
zio_checksum_generate,
zio_nop_write,
zio_brt_free,
zio_ddt_read_start,
zio_ddt_read_done,
zio_ddt_write,
zio_ddt_free,
zio_brt_free,
zio_gang_assemble,
zio_gang_issue,
zio_dva_throttle,
+4 -4
View File
@@ -717,10 +717,10 @@ post =
tags = ['functional', 'deadman']
[tests/functional/dedup]
tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing',
'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_prune_leak',
'dedup_zap_shrink']
tests = ['dedup_bclone_pruned', 'dedup_fdt_create', 'dedup_fdt_import',
'dedup_fdt_pacing', 'dedup_legacy_create', 'dedup_legacy_import',
'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota',
'dedup_prune', 'dedup_prune_leak', 'dedup_zap_shrink']
pre =
post =
tags = ['functional', 'dedup']
+1
View File
@@ -1503,6 +1503,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/deadman/deadman_zio.ksh \
functional/dedup/cleanup.ksh \
functional/dedup/setup.ksh \
functional/dedup/dedup_bclone_pruned.ksh \
functional/dedup/dedup_fdt_create.ksh \
functional/dedup/dedup_fdt_import.ksh \
functional/dedup/dedup_fdt_pacing.ksh \
@@ -0,0 +1,152 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2026, TrueNAS.
#
#
# DESCRIPTION:
# Verify that block cloning works correctly when the DDT entry for a
# dedup block has been pruned. When a block has the DEDUP bit set but
# no DDT entry (because it was pruned), cloning it must create a BRT
# entry to track the extra reference. Freeing the original must
# consult the BRT rather than proceeding directly to a DVA free,
# otherwise the block is freed while the clone still references it.
#
# STRATEGY:
# 1. Create a pool with both dedup and block_cloning enabled
# 2. Write a file with dedup=on so blocks get DEDUP bit set in their BPs
# 3. Prune the DDT to remove those entries (blocks remain, DEDUP bit
# stays set in block pointers)
# 4. Clone the file - brt_pending_apply_vdev() must fall back to BRT
# since ddt_addref() returns B_FALSE for pruned entries
# 5. Write a second copy via dd - same hash, new physical blocks, new
# DDT entries at different DVAs from the BRT-tracked blocks
# 6. Delete the clone first - must go through BRT, not DDT, even though
# a matching DDT entry now exists for the same hash
# 7. Delete the dd copy - DDT entries freed normally
# 8. Delete the original - no DDT entry, no BRT entry, DVA freed
# 9. Verify reference counts with zdb -b at each step
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "global"
log_assert "Block cloning of dedup blocks with pruned DDT entries uses BRT"
# Flush DDT log every TXG so entries appear in the ZAP immediately,
# making ddtprune effective and test behavior predictable.
log_must save_tunable DEDUP_LOG_TXG_MAX
log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN
log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000
function cleanup
{
if poolexists $TESTPOOL ; then
destroy_pool $TESTPOOL
fi
log_must restore_tunable DEDUP_LOG_TXG_MAX
log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN
}
log_onexit cleanup
log_must zpool create -f -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must zfs create -o dedup=sha256 -o recordsize=128k $TESTPOOL/$TESTFS
typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
# Write unique data: each block gets a DDT entry with refcnt=1.
log_must dd if=/dev/urandom of=$mountpoint/file1 bs=128k count=8
sync_pool $TESTPOOL
# Verify DDT has entries before pruning.
typeset entries=$(zpool status -D $TESTPOOL | \
grep "dedup: DDT entries" | awk '{print $4}')
log_must test "$entries" -eq 8
# Sleep 1s so the DDT entries are at least 1 second old. ddtprune uses
# an age-based cutoff and will silently skip entries that are too fresh.
sleep 1
# Prune all unique (refcnt=1) entries. The blocks remain on disk and the
# block pointers in file1 still have the DEDUP bit set, but there is no
# longer a DDT entry for them.
log_must zpool ddtprune -p 100 $TESTPOOL
sync_pool $TESTPOOL
# Confirm the prune actually removed all entries.
entries=$(zpool status -D $TESTPOOL | \
grep "dedup: DDT entries" | awk '{print $4}')
[[ -z "$entries" || "$entries" -eq 0 ]] || \
log_fail "DDT entries not pruned: $entries remain"
# Clone file1. brt_pending_apply_vdev() will see the DEDUP bit, call
# ddt_addref(), receive B_FALSE (no DDT entry), and fall through to
# create BRT entries instead.
log_must clonefile -f $mountpoint/file1 $mountpoint/clone1
sync_pool $TESTPOOL
# BRT entries exist; reference counts must be consistent.
log_must zdb -b $TESTPOOL
# Write a second copy via dd. Since the DDT was pruned, dedup can't find
# an existing entry and writes new physical blocks at new DVAs, creating
# fresh DDT entries with refcnt=1. The BRT-tracked blocks (file1/clone1)
# are at the old DVAs and are unaffected.
log_must dd if=$mountpoint/file1 of=$mountpoint/file2 bs=128k
sync_pool $TESTPOOL
# Eight new unique DDT entries (file2's blocks); BRT still holds refs for
# file1/clone1's old blocks.
typeset entries=$(zpool status -D $TESTPOOL | \
grep "dedup: DDT entries" | awk '{print $4}')
log_must test "$entries" -eq 8
log_must zdb -b $TESTPOOL
# Delete the clone first. Its blocks carry the DEDUP bit and the same
# hash as file2's DDT entries, but the DVAs differ — the free must go
# through BRT, not DDT, leaving file2's DDT entries intact.
log_must rm $mountpoint/clone1
sync_pool $TESTPOOL
entries=$(zpool status -D $TESTPOOL | \
grep "dedup: DDT entries" | awk '{print $4}')
log_must test "$entries" -eq 8
log_must zdb -b $TESTPOOL
# Delete file2. DDT entries freed; file1's BRT-tracked blocks unaffected.
log_must rm $mountpoint/file2
sync_pool $TESTPOOL
log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'"
log_must zdb -b $TESTPOOL
# Delete the original. No DDT entry, no BRT entry; DVA freed directly.
log_must rm $mountpoint/file1
sync_pool $TESTPOOL
log_must zdb -b $TESTPOOL
log_pass "Block cloning of dedup blocks with pruned DDT entries uses BRT"