From 437227a9cccc805889c80a4998276740197b5a24 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 4 Oct 2024 14:20:10 -0700 Subject: [PATCH 01/32] Update META Increase the version to 2.3.99 to indicate the master branch is newer than the 2.3.x release. This ensures packages built from master branch are considered to be newer than the last release. Signed-off-by: Brian Behlendorf --- META | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/META b/META index c2eff4ba0bb..185cca4a44d 100644 --- a/META +++ b/META @@ -1,8 +1,8 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.3.0 -Release: rc1 +Version: 2.3.99 +Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS From 87ca6ba9a8c5deb0fe610808035db7eefe3eeb71 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sun, 6 Oct 2024 23:29:20 +0200 Subject: [PATCH 02/32] ZTS: Remove FreeBSD 13.4-STABLE Current CI is failing on FreeBSD 13.4-STABLE, because samba4 can't be installed there. Lets remove it for now. Update also the FreeBSD version definitions a bit. The naming is like this now: FreeBSD variants: - freebsd13-3r, freebsd13-4r, freebsd14-0r, freebsd14-1r (RELEASE) - freebsd13-4s, freebsd14-1s (STABLE) - freebsd15-0c (CURRENT) RHL based distros: - almalinux8, almalinux9, centos-stream9, fedora39, fedora40 Debian based: - debian11, debian12, ubuntu20, ubuntu22, ubuntu24 Misc Linux distros: - archlinux, tumbleweed Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #16610 --- .github/workflows/scripts/qemu-2-start.sh | 32 ++++++++++++++++------- .github/workflows/zfs-qemu.yml | 14 +++++----- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 923c38c0f93..84e13832d10 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -14,7 +14,7 @@ OSv=$OS # compressed with .zst extension REPO="https://github.com/mcmilk/openzfs-freebsd-images" -FREEBSD="$REPO/releases/download/v2024-09-16" +FREEBSD="$REPO/releases/download/v2024-10-05" URLzs="" # Ubuntu mirrors @@ -62,33 +62,45 @@ case "$OS" in OSv="fedora39" URL="https://download.fedoraproject.org/pub/fedora/linux/releases/40/Cloud/x86_64/images/Fedora-Cloud-Base-Generic.x86_64-40-1.14.qcow2" ;; - freebsd13r) + freebsd13-3r) + OSNAME="FreeBSD 13.3-RELEASE" + OSv="freebsd13.0" + URLzs="$FREEBSD/amd64-freebsd-13.3-RELEASE.qcow2.zst" + BASH="/usr/local/bin/bash" + NIC="rtl8139" + ;; + freebsd13-4r) OSNAME="FreeBSD 13.4-RELEASE" OSv="freebsd13.0" URLzs="$FREEBSD/amd64-freebsd-13.4-RELEASE.qcow2.zst" BASH="/usr/local/bin/bash" NIC="rtl8139" ;; - freebsd13) - OSNAME="FreeBSD 13.4-STABLE" - OSv="freebsd13.0" - URLzs="$FREEBSD/amd64-freebsd-13.4-STABLE.qcow2.zst" + freebsd14-0r) + OSNAME="FreeBSD 14.0-RELEASE" + OSv="freebsd14.0" + URLzs="$FREEBSD/amd64-freebsd-14.0-RELEASE.qcow2.zst" BASH="/usr/local/bin/bash" - NIC="rtl8139" ;; - freebsd14r) + freebsd14-1r) OSNAME="FreeBSD 14.1-RELEASE" OSv="freebsd14.0" URLzs="$FREEBSD/amd64-freebsd-14.1-RELEASE.qcow2.zst" BASH="/usr/local/bin/bash" ;; - freebsd14) + freebsd13-4s) + OSNAME="FreeBSD 13.4-STABLE" + OSv="freebsd13.0" + URLzs="$FREEBSD/amd64-freebsd-13.4-STABLE.qcow2.zst" + BASH="/usr/local/bin/bash" + ;; + freebsd14-1s) OSNAME="FreeBSD 14.1-STABLE" OSv="freebsd14.0" URLzs="$FREEBSD/amd64-freebsd-14.1-STABLE.qcow2.zst" BASH="/usr/local/bin/bash" ;; - freebsd15) + freebsd15-0c) OSNAME="FreeBSD 15.0-CURRENT" OSv="freebsd14.0" URLzs="$FREEBSD/amd64-freebsd-15.0-CURRENT.qcow2.zst" diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index 8922701f989..f819e9938e3 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -22,8 +22,8 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora39", "fedora40", "freebsd13", "freebsd13r", "freebsd14", "freebsd14r", "ubuntu20", "ubuntu22", "ubuntu24"]' - QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora40", "freebsd13", "freebsd14", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora39", "fedora40", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]' + QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora40", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" if ${{ github.event_name == 'pull_request' }}; then @@ -46,10 +46,12 @@ jobs: strategy: fail-fast: false matrix: - # all: - # os: [almalinux8, almalinux9, archlinux, centos-stream9, fedora39, fedora40, debian11, debian12, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] - # openzfs: - # os: [almalinux8, almalinux9, centos-stream9, debian11, debian12, fedora39, fedora40, freebsd13, freebsd13r, freebsd14, freebsd14r, ubuntu20, ubuntu22, ubuntu24] + # rhl: almalinux8, almalinux9, centos-stream9, fedora39, fedora40 + # debian: debian11, debian12, ubuntu20, ubuntu22, ubuntu24 + # misc: archlinux, tumbleweed + # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-0r, freebsd14-1r + # FreeBSD Stable: freebsd13-4s, freebsd14-1s + # FreeBSD Current: freebsd15-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} runs-on: ubuntu-24.04 steps: From 995a3a61fdcf6b6b4479f073c19271854c48de6f Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sun, 6 Oct 2024 23:32:08 +0200 Subject: [PATCH 03/32] ZTS: Fix summary page creation again - second try In PR #16599 I used 'return' like in C - which is wrong :/ This fix generates the summary as needed. Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #16611 --- .github/workflows/scripts/qemu-9-summary-page.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/qemu-9-summary-page.sh b/.github/workflows/scripts/qemu-9-summary-page.sh index cae287ac107..737dda01b56 100755 --- a/.github/workflows/scripts/qemu-9-summary-page.sh +++ b/.github/workflows/scripts/qemu-9-summary-page.sh @@ -11,12 +11,10 @@ function output() { } function outfile() { - test -s "$1" || return cat "$1" >> "out-$logfile.md" } function outfile_plain() { - test -s "$1" || return output "
"
   cat "$1" >> "out-$logfile.md"
   output "
" @@ -45,6 +43,8 @@ if [ ! -f out-1.md ]; then tar xf "$tarfile" test -s env.txt || continue source env.txt + # when uname.txt is there, the other files are also ok + test -s uname.txt || continue output "\n## Functional Tests: $OSNAME\n" outfile_plain uname.txt outfile_plain summary.txt From 0b4dcbe5b425be70bc10d836d8e66711c690e8d4 Mon Sep 17 00:00:00 2001 From: JKDingwall Date: Sun, 6 Oct 2024 22:36:33 +0100 Subject: [PATCH 04/32] Fix generation of kernel uevents for snapshot rename on linux `zvol_rename_minors()` needs to be given the full path not just the snapshot name. Use code removed in a0bd735ad as a guide to providing the necessary values. Add ZTS check for /dev changes after snapshot rename. After renaming a snapshot with 'snapdev=visible' ensure that the /dev entries are updated to reflect the rename. Reviewed-by: Brian Behlendorf Signed-off-by: James Dingwall Closes #14223 Closes #16600 --- module/zfs/dsl_dataset.c | 11 +++++++++-- .../functional/zvol/zvol_misc/zvol_misc_snapdev.ksh | 13 +++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 6a9ed891093..2248f644bee 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2987,6 +2987,7 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, dsl_dataset_t *ds; uint64_t val; dmu_tx_t *tx = ddrsa->ddrsa_tx; + char *oldname, *newname; int error; error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); @@ -3011,8 +3012,14 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, VERIFY0(zap_add(dp->dp_meta_objset, dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); - zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname, - ddrsa->ddrsa_newsnapname, B_TRUE); + + oldname = kmem_asprintf("%s@%s", ddrsa->ddrsa_fsname, + ddrsa->ddrsa_oldsnapname); + newname = kmem_asprintf("%s@%s", ddrsa->ddrsa_fsname, + ddrsa->ddrsa_newsnapname); + zvol_rename_minors(dp->dp_spa, oldname, newname, B_TRUE); + kmem_strfree(oldname); + kmem_strfree(newname); dsl_dataset_rele(ds, FTAG); return (0); diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh index 1fc2d2780b1..af780b628ce 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh @@ -117,5 +117,18 @@ log_must zfs set snapdev=visible $TESTPOOL verify_inherited 'snapdev' 'hidden' $SUBZVOL $VOLFS blockdev_missing $SUBSNAPDEV blockdev_exists $SNAPDEV +log_must zfs destroy $SNAP + +# 4. Verify "rename" is correctly reflected when "snapdev=visible" +# 4.1 First create a snapshot and verify the device is present +log_must zfs snapshot $SNAP +log_must zfs set snapdev=visible $ZVOL +blockdev_exists $SNAPDEV +# 4.2 rename the snapshot and verify the devices are updated +log_must zfs rename $SNAP $SNAP-new +blockdev_missing $SNAPDEV +blockdev_exists $SNAPDEV-new +# 4.3 cleanup +log_must zfs destroy $SNAP-new log_pass "ZFS volume property 'snapdev' works as expected" From c59d5495fe171aac752ab5bf8b7be325a2ef861d Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 5 Oct 2024 20:39:21 +0800 Subject: [PATCH 05/32] contrib/debian: add new manpages to installation list Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #16609 --- contrib/debian/openzfs-zfsutils.install | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index d51e4ef003e..546745930bf 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -98,6 +98,7 @@ usr/share/man/man8/zpool-attach.8 usr/share/man/man8/zpool-checkpoint.8 usr/share/man/man8/zpool-clear.8 usr/share/man/man8/zpool-create.8 +usr/share/man/man8/zpool-ddtprune.8 usr/share/man/man8/zpool-destroy.8 usr/share/man/man8/zpool-detach.8 usr/share/man/man8/zpool-ddtprune.8 @@ -113,6 +114,7 @@ usr/share/man/man8/zpool-list.8 usr/share/man/man8/zpool-offline.8 usr/share/man/man8/zpool-online.8 usr/share/man/man8/zpool-prefetch.8 +usr/share/man/man8/zpool-prefetch.8 usr/share/man/man8/zpool-reguid.8 usr/share/man/man8/zpool-remove.8 usr/share/man/man8/zpool-reopen.8 From e8f0aa143e1b25d98624d4b2623a6e0fc42afb9e Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 5 Oct 2024 20:29:37 +0800 Subject: [PATCH 06/32] Bump SONAME of libzfs and libzpool The ABI of libzfs and libzpool have breaking changes since last SONAME bump in commit fe6babc: * libzfs: `zpool_print_unsup_feat` removed (used by zpool cmd). * libzpool: multiple `ddt_*` symbols removed (used by zdb cmd). Bump them to avoid ABI breakage. See: https://github.com/openzfs/zfs/pull/11817 Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #16609 --- config/deb.am | 8 ++-- contrib/debian/Makefile.am | 8 ++-- contrib/debian/clean | 4 +- contrib/debian/control | 26 +++++------ ...nzfs-libzfs4.docs => openzfs-libzfs6.docs} | 0 ....install.in => openzfs-libzfs6.install.in} | 0 ...-libzpool5.docs => openzfs-libzpool6.docs} | 0 ...nstall.in => openzfs-libzpool6.install.in} | 0 lib/libzfs/Makefile.am | 2 +- lib/libzfs/libzfs.abi | 2 +- lib/libzpool/Makefile.am | 2 +- rpm/generic/zfs.spec.in | 45 ++++++++++--------- 12 files changed, 50 insertions(+), 47 deletions(-) rename contrib/debian/{openzfs-libzfs4.docs => openzfs-libzfs6.docs} (100%) rename contrib/debian/{openzfs-libzfs4.install.in => openzfs-libzfs6.install.in} (100%) rename contrib/debian/{openzfs-libzpool5.docs => openzfs-libzpool6.docs} (100%) rename contrib/debian/{openzfs-libzpool5.install.in => openzfs-libzpool6.install.in} (100%) diff --git a/config/deb.am b/config/deb.am index 4d86a1b7061..39467aa425e 100644 --- a/config/deb.am +++ b/config/deb.am @@ -58,9 +58,9 @@ deb-utils: deb-local rpm-utils-initramfs pkg1=$${name}-$${version}.$${arch}.rpm; \ pkg2=libnvpair3-$${version}.$${arch}.rpm; \ pkg3=libuutil3-$${version}.$${arch}.rpm; \ - pkg4=libzfs5-$${version}.$${arch}.rpm; \ - pkg5=libzpool5-$${version}.$${arch}.rpm; \ - pkg6=libzfs5-devel-$${version}.$${arch}.rpm; \ + pkg4=libzfs6-$${version}.$${arch}.rpm; \ + pkg5=libzpool6-$${version}.$${arch}.rpm; \ + pkg6=libzfs6-devel-$${version}.$${arch}.rpm; \ pkg7=$${name}-test-$${version}.$${arch}.rpm; \ pkg8=$${name}-dracut-$${version}.noarch.rpm; \ pkg9=$${name}-initramfs-$${version}.$${arch}.rpm; \ @@ -72,7 +72,7 @@ deb-utils: deb-local rpm-utils-initramfs path_prepend=`mktemp -d /tmp/intercept.XXXXXX`; \ echo "#!$(SHELL)" > $${path_prepend}/dh_shlibdeps; \ echo "`which dh_shlibdeps` -- \ - -xlibuutil3linux -xlibnvpair3linux -xlibzfs5linux -xlibzpool5linux" \ + -xlibuutil3linux -xlibnvpair3linux -xlibzfs6linux -xlibzpool6linux" \ >> $${path_prepend}/dh_shlibdeps; \ ## These -x arguments are passed to dpkg-shlibdeps, which exclude the ## Debianized packages from the auto-generated dependencies of the new debs, diff --git a/contrib/debian/Makefile.am b/contrib/debian/Makefile.am index f76b59645ea..99d512312df 100644 --- a/contrib/debian/Makefile.am +++ b/contrib/debian/Makefile.am @@ -12,14 +12,14 @@ dist_noinst_DATA += %D%/openzfs-libpam-zfs.postinst dist_noinst_DATA += %D%/openzfs-libpam-zfs.prerm dist_noinst_DATA += %D%/openzfs-libuutil3.docs dist_noinst_DATA += %D%/openzfs-libuutil3.install.in -dist_noinst_DATA += %D%/openzfs-libzfs4.docs -dist_noinst_DATA += %D%/openzfs-libzfs4.install.in +dist_noinst_DATA += %D%/openzfs-libzfs6.docs +dist_noinst_DATA += %D%/openzfs-libzfs6.install.in dist_noinst_DATA += %D%/openzfs-libzfsbootenv1.docs dist_noinst_DATA += %D%/openzfs-libzfsbootenv1.install.in dist_noinst_DATA += %D%/openzfs-libzfs-dev.docs dist_noinst_DATA += %D%/openzfs-libzfs-dev.install.in -dist_noinst_DATA += %D%/openzfs-libzpool5.docs -dist_noinst_DATA += %D%/openzfs-libzpool5.install.in +dist_noinst_DATA += %D%/openzfs-libzpool6.docs +dist_noinst_DATA += %D%/openzfs-libzpool6.install.in dist_noinst_DATA += %D%/openzfs-python3-pyzfs.install dist_noinst_DATA += %D%/openzfs-zfs-dkms.config dist_noinst_DATA += %D%/openzfs-zfs-dkms.dkms diff --git a/contrib/debian/clean b/contrib/debian/clean index 3100d693aeb..4f52d01b810 100644 --- a/contrib/debian/clean +++ b/contrib/debian/clean @@ -6,6 +6,6 @@ contrib/pyzfs/libzfs_core/bindings/__pycache__/ contrib/pyzfs/pyzfs.egg-info/ debian/openzfs-libnvpair3.install debian/openzfs-libuutil3.install -debian/openzfs-libzfs4.install +debian/openzfs-libzfs6.install debian/openzfs-libzfs-dev.install -debian/openzfs-libzpool5.install +debian/openzfs-libzpool6.install diff --git a/contrib/debian/control b/contrib/debian/control index e56fbf0f1c9..6829c0ccdf9 100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -78,9 +78,9 @@ Architecture: linux-any Depends: libssl-dev | libssl1.0-dev, openzfs-libnvpair3 (= ${binary:Version}), openzfs-libuutil3 (= ${binary:Version}), - openzfs-libzfs4 (= ${binary:Version}), + openzfs-libzfs6 (= ${binary:Version}), openzfs-libzfsbootenv1 (= ${binary:Version}), - openzfs-libzpool5 (= ${binary:Version}), + openzfs-libzpool6 (= ${binary:Version}), ${misc:Depends} Replaces: libzfslinux-dev Conflicts: libzfslinux-dev @@ -90,18 +90,18 @@ Description: OpenZFS filesystem development files for Linux libraries of OpenZFS filesystem. . This package includes the development files of libnvpair3, libuutil3, - libzpool5 and libzfs4. + libzpool6 and libzfs6. -Package: openzfs-libzfs4 +Package: openzfs-libzfs6 Section: contrib/libs Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} # The libcurl4 is loaded through dlopen("libcurl.so.4"). # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=988521 Recommends: libcurl4 -Breaks: libzfs2, libzfs4 -Replaces: libzfs2, libzfs4, libzfs4linux -Conflicts: libzfs4linux +Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux +Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux +Conflicts: libzfs6linux Description: OpenZFS filesystem library for Linux - general support OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, @@ -123,13 +123,13 @@ Description: OpenZFS filesystem library for Linux - label info support . The zfsbootenv library provides support for modifying ZFS label information. -Package: openzfs-libzpool5 +Package: openzfs-libzpool6 Section: contrib/libs Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} -Breaks: libzpool2, libzpool5 -Replaces: libzpool2, libzpool5, libzpool5linux -Conflicts: libzpool5linux +Breaks: libzpool2, libzpool5, libzpool5linux, libzpool6linux +Replaces: libzpool2, libzpool5, libzpool5linux, libzpool6linux +Conflicts: libzpool6linux Description: OpenZFS pool library for Linux OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, @@ -246,8 +246,8 @@ Architecture: linux-any Pre-Depends: ${misc:Pre-Depends} Depends: openzfs-libnvpair3 (= ${binary:Version}), openzfs-libuutil3 (= ${binary:Version}), - openzfs-libzfs4 (= ${binary:Version}), - openzfs-libzpool5 (= ${binary:Version}), + openzfs-libzfs6 (= ${binary:Version}), + openzfs-libzpool6 (= ${binary:Version}), python3, ${misc:Depends}, ${shlibs:Depends} diff --git a/contrib/debian/openzfs-libzfs4.docs b/contrib/debian/openzfs-libzfs6.docs similarity index 100% rename from contrib/debian/openzfs-libzfs4.docs rename to contrib/debian/openzfs-libzfs6.docs diff --git a/contrib/debian/openzfs-libzfs4.install.in b/contrib/debian/openzfs-libzfs6.install.in similarity index 100% rename from contrib/debian/openzfs-libzfs4.install.in rename to contrib/debian/openzfs-libzfs6.install.in diff --git a/contrib/debian/openzfs-libzpool5.docs b/contrib/debian/openzfs-libzpool6.docs similarity index 100% rename from contrib/debian/openzfs-libzpool5.docs rename to contrib/debian/openzfs-libzpool6.docs diff --git a/contrib/debian/openzfs-libzpool5.install.in b/contrib/debian/openzfs-libzpool6.install.in similarity index 100% rename from contrib/debian/openzfs-libzpool5.install.in rename to contrib/debian/openzfs-libzpool6.install.in diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index a976faaf991..5f8963dccd1 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -70,7 +70,7 @@ if BUILD_FREEBSD libzfs_la_LIBADD += -lutil -lgeom endif -libzfs_la_LDFLAGS += -version-info 5:0:1 +libzfs_la_LDFLAGS += -version-info 6:0:0 pkgconfig_DATA += %D%/libzfs.pc diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 1a96460c2b8..ac9ae233c72 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1,4 +1,4 @@ - + diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 397959c679e..404b737c204 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -212,7 +212,7 @@ if BUILD_FREEBSD libzpool_la_LIBADD += -lgeom endif -libzpool_la_LDFLAGS += -version-info 5:0:0 +libzpool_la_LDFLAGS += -version-info 6:0:0 if TARGET_CPU_POWERPC module/zfs/libzpool_la-vdev_raidz_math_powerpc_altivec.$(OBJEXT) : CFLAGS += -maltivec diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index c7a00c61f6b..d0d850af262 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -99,10 +99,10 @@ License: @ZFS_META_LICENSE@ URL: https://github.com/openzfs/zfs Source0: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) -Requires: libzpool5%{?_isa} = %{version}-%{release} +Requires: libzpool6%{?_isa} = %{version}-%{release} Requires: libnvpair3%{?_isa} = %{version}-%{release} Requires: libuutil3%{?_isa} = %{version}-%{release} -Requires: libzfs5%{?_isa} = %{version}-%{release} +Requires: libzfs6%{?_isa} = %{version}-%{release} Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version}-%{release} Obsoletes: spl <= %{version} @@ -150,21 +150,22 @@ Requires: sysstat %description This package contains the core ZFS command line utilities. -%package -n libzpool5 +%package -n libzpool6 Summary: Native ZFS pool library for Linux Group: System Environment/Kernel Obsoletes: libzpool2 <= %{version} Obsoletes: libzpool4 <= %{version} +Obsoletes: libzpool5 <= %{version} -%description -n libzpool5 +%description -n libzpool6 This package contains the zpool library, which provides support for managing zpools %if %{defined ldconfig_scriptlets} -%ldconfig_scriptlets -n libzpool5 +%ldconfig_scriptlets -n libzpool6 %else -%post -n libzpool5 -p /sbin/ldconfig -%postun -n libzpool5 -p /sbin/ldconfig +%post -n libzpool6 -p /sbin/ldconfig +%postun -n libzpool6 -p /sbin/ldconfig %endif %package -n libnvpair3 @@ -211,37 +212,39 @@ This library provides a variety of compatibility functions for OpenZFS: # The library version is encoded in the package name. When updating the # version information it is important to add an obsoletes line below for # the previous version of the package. -%package -n libzfs5 +%package -n libzfs6 Summary: Native ZFS filesystem library for Linux Group: System Environment/Kernel Obsoletes: libzfs2 <= %{version} Obsoletes: libzfs4 <= %{version} +Obsoletes: libzfs5 <= %{version} -%description -n libzfs5 +%description -n libzfs6 This package provides support for managing ZFS filesystems %if %{defined ldconfig_scriptlets} -%ldconfig_scriptlets -n libzfs5 +%ldconfig_scriptlets -n libzfs6 %else -%post -n libzfs5 -p /sbin/ldconfig -%postun -n libzfs5 -p /sbin/ldconfig +%post -n libzfs6 -p /sbin/ldconfig +%postun -n libzfs6 -p /sbin/ldconfig %endif -%package -n libzfs5-devel +%package -n libzfs6-devel Summary: Development headers Group: System Environment/Kernel -Requires: libzfs5%{?_isa} = %{version}-%{release} -Requires: libzpool5%{?_isa} = %{version}-%{release} +Requires: libzfs6%{?_isa} = %{version}-%{release} +Requires: libzpool6%{?_isa} = %{version}-%{release} Requires: libnvpair3%{?_isa} = %{version}-%{release} Requires: libuutil3%{?_isa} = %{version}-%{release} -Provides: libzpool5-devel = %{version}-%{release} +Provides: libzpool6-devel = %{version}-%{release} Provides: libnvpair3-devel = %{version}-%{release} Provides: libuutil3-devel = %{version}-%{release} Obsoletes: zfs-devel <= %{version} Obsoletes: libzfs2-devel <= %{version} Obsoletes: libzfs4-devel <= %{version} +Obsoletes: libzfs5-devel <= %{version} -%description -n libzfs5-devel +%description -n libzfs6-devel This package contains the header files needed for building additional applications against the ZFS libraries. @@ -290,7 +293,7 @@ Summary: Python %{python_version} wrapper for libzfs_core Group: Development/Languages/Python License: Apache-2.0 BuildArch: noarch -Requires: libzfs5 = %{version}-%{release} +Requires: libzfs6 = %{version}-%{release} Requires: libnvpair3 = %{version}-%{release} Requires: libffi Requires: python%{__python_pkg_version} @@ -534,7 +537,7 @@ systemctl --system daemon-reload >/dev/null || true %config(noreplace) %{_bashcompletiondir}/zfs %config(noreplace) %{_bashcompletiondir}/zpool -%files -n libzpool5 +%files -n libzpool6 %{_libdir}/libzpool.so.* %files -n libnvpair3 @@ -543,10 +546,10 @@ systemctl --system daemon-reload >/dev/null || true %files -n libuutil3 %{_libdir}/libuutil.so.* -%files -n libzfs5 +%files -n libzfs6 %{_libdir}/libzfs*.so.* -%files -n libzfs5-devel +%files -n libzfs6-devel %{_pkgconfigdir}/libzfs.pc %{_pkgconfigdir}/libzfsbootenv.pc %{_pkgconfigdir}/libzfs_core.pc From ab777f436ce228559f3c2480ebe8a6b5ab888c43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Mon, 7 Oct 2024 19:31:46 +0200 Subject: [PATCH 07/32] Return boolean_t in inline functions of lib/libspl/include/sys/uio.h The inline functions zfs_dio_offset_aligned(), zfs_dio_size_aligned() and zfs_dio_aligned() are declared as boolean_t but return the bool type. This fixes the build of FreeBSD. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Closes #16613 --- lib/libspl/include/sys/uio.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index 2cb0107d58f..16749fa492e 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -92,20 +92,20 @@ zfs_dio_page_aligned(void *buf) static inline boolean_t zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) { - return (IS_P2ALIGNED(offset, blksz)); + return ((IS_P2ALIGNED(offset, blksz)) ? B_TRUE : B_FALSE); } static inline boolean_t zfs_dio_size_aligned(uint64_t size, uint64_t blksz) { - return ((size % blksz) == 0); + return (((size % blksz) == 0) ? B_TRUE : B_FALSE); } static inline boolean_t zfs_dio_aligned(uint64_t offset, uint64_t size, uint64_t blksz) { - return (zfs_dio_offset_aligned(offset, blksz) && - zfs_dio_size_aligned(size, blksz)); + return ((zfs_dio_offset_aligned(offset, blksz) && + zfs_dio_size_aligned(size, blksz)) ? B_TRUE : B_FALSE); } static inline void From ca0141f325ec706d38a06f9aeb8e5eb6c6a8d09a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 8 Oct 2024 07:09:08 +1100 Subject: [PATCH 08/32] zpool/zfs: restore -V & --version options The -j option added a round of getopt, which didn't know the magic version flags. So just bypass the whole thing and go straight to the human output function for the special case. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Umer Saleem Signed-off-by: Rob Norris Closes #16615 Closes #16617 --- cmd/zfs/zfs_main.c | 2 +- cmd/zpool/zpool_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 51a0fbd43c1..f979fd189cc 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -9187,7 +9187,7 @@ main(int argc, char **argv) * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) - return (zfs_do_version(argc, argv)); + return (zfs_version_print() != 0); /* * Special case 'help' diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index aa7da68aa68..bc9f90cae08 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -13613,7 +13613,7 @@ main(int argc, char **argv) * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) - return (zpool_do_version(argc, argv)); + return (zfs_version_print() != 0); /* * Special case 'help' From 65a94ffa80466246f37322393c3b5bd460cb8987 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 8 Oct 2024 21:27:38 +0500 Subject: [PATCH 09/32] Only serialize native-deb* targets .NOTPARALLEL target is being forced on userspace as well. This commit removes .NOTPARALEL target and only serializes the execution of native-deb* targets. Reviewed-by: Brian Behlendorf Signed-off-by: Umer Saleem Closes #16622 --- config/deb.am | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/config/deb.am b/config/deb.am index 39467aa425e..9e58e1905b7 100644 --- a/config/deb.am +++ b/config/deb.am @@ -93,13 +93,17 @@ debian: cp -r contrib/debian debian; chmod +x debian/rules; native-deb-utils: native-deb-local debian + while [ -f debian/deb-build.lock ]; do sleep 1; done; \ + echo "native-deb-utils" > debian/deb-build.lock; \ cp contrib/debian/control debian/control; \ - $(DPKGBUILD) -b -rfakeroot -us -uc; + $(DPKGBUILD) -b -rfakeroot -us -uc; \ + $(RM) -f debian/deb-build.lock native-deb-kmod: native-deb-local debian + while [ -f debian/deb-build.lock ]; do sleep 1; done; \ + echo "native-deb-kmod" > debian/deb-build.lock; \ sh scripts/make_gitrev.sh; \ - fakeroot debian/rules override_dh_binary-modules; + fakeroot debian/rules override_dh_binary-modules; \ + $(RM) -f debian/deb-build.lock native-deb: native-deb-utils native-deb-kmod - -.NOTPARALLEL: native-deb native-deb-utils native-deb-kmod From 75dda92dc36a9265a25e66fb38284620644d4c61 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 8 Oct 2024 09:35:13 -0700 Subject: [PATCH 10/32] ZTS: resilver_restart_001.ksh restore defaults Update resilver_restart_001.ksh to restore the default resilver_defer_percent when the test completes. Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Reviewed-by: Pavel Snajdr Signed-off-by: Brian Behlendorf Closes #16618 --- .../tests/functional/replacement/resilver_restart_001.ksh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh index 629870adf9e..101fa410bdb 100755 --- a/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh +++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh @@ -48,6 +48,8 @@ function cleanup log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ $ORIG_SCAN_SUSPEND_PROGRESS + log_must set_tunable32 RESILVER_DEFER_PERCENT \ + $ORIG_RESILVER_DEFER_PERCENT log_must set_tunable32 ZEVENT_LEN_MAX $ORIG_ZFS_ZEVENT_LEN_MAX log_must zinject -c all destroy_pool $TESTPOOL1 @@ -90,6 +92,7 @@ log_assert "Check for unnecessary resilver restarts" ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS) ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) +ORIG_RESILVER_DEFER_PERCENT=$(get_tunable RESILVER_DEFER_PERCENT) ORIG_ZFS_ZEVENT_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX) set -A RESTARTS -- '1' '2' '2' '2' From cefef28e9895fae01e294c0e8075b8bf98037acd Mon Sep 17 00:00:00 2001 From: Matthew Heller Date: Tue, 8 Oct 2024 20:43:04 -0400 Subject: [PATCH 11/32] vdev_id: multi-lun disks & slot num zero pad Add ability to generate disk names that contain both a slot number and a lun number in order to support multi-actuator SAS hard drives with multiple luns. Also add the ability to zero pad slot numbers to a desired digit length for easier sorting. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Heller Closes #16603 --- man/man5/vdev_id.conf.5 | 10 +++++++++- udev/vdev_id | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/man/man5/vdev_id.conf.5 b/man/man5/vdev_id.conf.5 index a2d38add4ee..aaf91825a6c 100644 --- a/man/man5/vdev_id.conf.5 +++ b/man/man5/vdev_id.conf.5 @@ -92,6 +92,11 @@ before a generic mapping for the same slot. In this way a custom mapping may be applied to a particular channel and a default mapping applied to the others. . +.It Sy zpad_slot Ar digits +Pad slot numbers with zeros to make them +.Ar digits +long, which can help to make disk names a consistent length and easier to sort. +. .It Sy multipath Sy yes Ns | Ns Sy no Specifies whether .Xr vdev_id 8 @@ -122,7 +127,7 @@ device is connected to. The default is .Sy 4 . . -.It Sy slot Sy bay Ns | Ns Sy phy Ns | Ns Sy port Ns | Ns Sy id Ns | Ns Sy lun Ns | Ns Sy ses +.It Sy slot Sy bay Ns | Ns Sy phy Ns | Ns Sy port Ns | Ns Sy id Ns | Ns Sy lun Ns | Ns Sy bay_lun Ns | Ns Sy ses Specifies from which element of a SAS identifier the slot number is taken. The default is @@ -138,6 +143,9 @@ use the SAS port as the slot number. use the scsi id as the slot number. .It Sy lun use the scsi lun as the slot number. +.It Sy bay_lun +read the slot number from the bay identifier and append the lun number. +Useful for multi-lun multi-actuator hard drives. .It Sy ses use the SCSI Enclosure Services (SES) enclosure device slot number, as reported by diff --git a/udev/vdev_id b/udev/vdev_id index 7b5aab14199..5897fea3ed6 100755 --- a/udev/vdev_id +++ b/udev/vdev_id @@ -124,6 +124,7 @@ TOPOLOGY= BAY= ENCL_ID="" UNIQ_ENCL_ID="" +ZPAD=1 usage() { cat << EOF @@ -154,7 +155,7 @@ map_slot() { if [ -z "$MAPPED_SLOT" ] ; then MAPPED_SLOT=$LINUX_SLOT fi - printf "%d" "${MAPPED_SLOT}" + printf "%0${ZPAD}d" "${MAPPED_SLOT}" } map_channel() { @@ -430,6 +431,15 @@ sas_handler() { d=$(eval echo '$'{$i}) SLOT=$(echo "$d" | sed -e 's/^.*://') ;; + "bay_lun") + # Like 'bay' but with the LUN number appened. Added for SAS + # multi-actuator HDDs, where one physical drive has multiple + # LUNs, thus multiple logical drives share the same bay number + i=$((i + 2)) + d=$(eval echo '$'{$i}) + LUN="-lun$(echo "$d" | sed -e 's/^.*://')" + SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null) + ;; "ses") # look for this SAS path in all SCSI Enclosure Services # (SES) enclosures @@ -460,7 +470,7 @@ sas_handler() { if [ -z "$CHAN" ] ; then return fi - echo "${CHAN}"-"${JBOD}"-"${SLOT}${PART}" + echo "${CHAN}"-"${JBOD}"-"${SLOT}${LUN}${PART}" else CHAN=$(map_channel "$PCI_ID" "$PORT") SLOT=$(map_slot "$SLOT" "$CHAN") @@ -468,7 +478,7 @@ sas_handler() { if [ -z "$CHAN" ] ; then return fi - echo "${CHAN}${SLOT}${PART}" + echo "${CHAN}${SLOT}${LUN}${PART}" fi } @@ -748,6 +758,8 @@ if [ -z "$BAY" ] ; then BAY=$(awk '($1 == "slot") {print $2; exit}' "$CONFIG") fi +ZPAD=$(awk '($1 == "zpad_slot") {print $2; exit}' "$CONFIG") + TOPOLOGY=${TOPOLOGY:-sas_direct} # Should we create /dev/by-enclosure symlinks? From 4319e714022ace9e74659c15be0af3cd2f22d31a Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 8 Oct 2024 20:41:17 -0700 Subject: [PATCH 12/32] ztest: Fix scrub check in ztest_raidz_expand_check() The scrub code may return EBUSY under several possible scenarios causing ztest to incorrectly ASSERT when verifying the result of a raidz expansion. Update the test case to allow EBUSY since it does not indicate pool damage. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #16627 --- cmd/ztest.c | 19 ++++++++++++++++++- lib/libzfs/libzfs_pool.c | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index 523f280aae1..4a7959ebfca 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -6717,6 +6717,17 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * * Only after a full scrub has been completed is it safe to start injecting * data corruption. See the comment in zfs_fault_inject(). + * + * EBUSY may be returned for the following six cases. It's the callers + * responsibility to handle them accordingly. + * + * Current state Requested + * 1. Normal Scrub Running Normal Scrub or Error Scrub + * 2. Normal Scrub Paused Error Scrub + * 3. Normal Scrub Paused Pause Normal Scrub + * 4. Error Scrub Running Normal Scrub or Error Scrub + * 5. Error Scrub Paused Pause Error Scrub + * 6. Resilvering Anything else */ static int ztest_scrub_impl(spa_t *spa) @@ -8082,8 +8093,14 @@ ztest_raidz_expand_check(spa_t *spa) (void) printf("verifying an interrupted raidz " "expansion using a pool scrub ...\n"); } + /* Will fail here if there is non-recoverable corruption detected */ - VERIFY0(ztest_scrub_impl(spa)); + int error = ztest_scrub_impl(spa); + if (error == EBUSY) + error = 0; + + VERIFY0(error); + if (ztest_opts.zo_verbose >= 1) { (void) printf("raidz expansion scrub check complete\n"); } diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 14410b15313..44f2c6f19df 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2796,7 +2796,7 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) } /* - * With EBUSY, five cases are possible: + * With EBUSY, six cases are possible: * * Current state Requested * 1. Normal Scrub Running Normal Scrub or Error Scrub From efeb60b86a22b4d58c9eaf73c862bbe0cd3a7fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Wed, 9 Oct 2024 18:27:46 +0200 Subject: [PATCH 13/32] FreeBSD: ignore some includes when not building kernel The function abd_alloc_from_pages() is used only in kernel. Excluding sys/vm.h, and vm/vm_page.h includes avoids dependency problems. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Closes #16616 --- include/os/freebsd/zfs/sys/abd_os.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/os/freebsd/zfs/sys/abd_os.h b/include/os/freebsd/zfs/sys/abd_os.h index be825b3b8a4..401c8452d6b 100644 --- a/include/os/freebsd/zfs/sys/abd_os.h +++ b/include/os/freebsd/zfs/sys/abd_os.h @@ -26,8 +26,10 @@ #ifndef _ABD_OS_H #define _ABD_OS_H +#ifdef _KERNEL #include #include +#endif #ifdef __cplusplus extern "C" { @@ -47,8 +49,10 @@ struct abd_linear { #endif }; +#ifdef _KERNEL __attribute__((malloc)) struct abd *abd_alloc_from_pages(vm_page_t *, unsigned long, uint64_t); +#endif #ifdef __cplusplus } From b4e4cbeb20240cc7a780fb0e4bebd0134701fee8 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 9 Oct 2024 15:28:08 -0400 Subject: [PATCH 14/32] Always validate checksums for Direct I/O reads This fixes an oversight in the Direct I/O PR. There is nothing that stops a process from manipulating the contents of a buffer for a Direct I/O read while the I/O is in flight. This can lead checksum verify failures. However, the disk contents are still correct, and this would lead to false reporting of checksum validation failures. To remedy this, all Direct I/O reads that have a checksum verification failure are treated as suspicious. In the event a checksum validation failure occurs for a Direct I/O read, then the I/O request will be reissued though the ARC. This allows for actual validation to happen and removes any possibility of the buffer being manipulated after the I/O has been issued. Just as with Direct I/O write checksum validation failures, Direct I/O read checksum validation failures are reported though zpool status -d in the DIO column. Also the zevent has been updated to have both: 1. dio_verify_wr -> Checksum verification failure for writes 2. dio_verify_rd -> Checksum verification failure for reads. This allows for determining what I/O operation was the culprit for the checksum verification failure. All DIO errors are reported only on the top-level VDEV. Even though FreeBSD can write protect pages (stable pages) it still has the same issue as Linux with Direct I/O reads. This commit updates the following: 1. Propogates checksum failures for reads all the way up to the top-level VDEV. 2. Reports errors through zpool status -d as DIO. 3. Has two zevents for checksum verify errors with Direct I/O. One for read and one for write. 4. Updates FreeBSD ABD code to also check for ABD_FLAG_FROM_PAGES and handle ABD buffer contents validation the same as Linux. 5. Updated manipulate_user_buffer.c to also manipulate a buffer while a Direct I/O read is taking place. 6. Adds a new ZTS test case dio_read_verify that stress tests the new code. 7. Updated man pages. 8. Added an IMPLY statement to zio_checksum_verify() to make sure that Direct I/O reads are not issued as speculative. 9. Removed self healing through mirror, raidz, and dRAID VDEVs for Direct I/O reads. This issue was first observed when installing a Windows 11 VM on a ZFS dataset with the dataset property direct set to always. The zpool devices would report checksum failures, but running a subsequent zpool scrub would not repair any data and report no errors. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Brian Atkinson Closes #16598 --- cmd/zpool/zpool_main.c | 6 + include/sys/fm/fs/zfs.h | 3 +- include/sys/vdev_raidz.h | 2 +- include/sys/zio.h | 29 +-- man/man4/zfs.4 | 2 +- man/man8/zpool-events.8 | 5 +- man/man8/zpool-status.8 | 8 +- module/os/freebsd/zfs/abd_os.c | 41 +++- module/os/linux/zfs/abd_os.c | 4 +- module/zcommon/zfs_valstr.c | 1 + module/zfs/dmu_direct.c | 2 +- module/zfs/vdev_draid.c | 2 +- module/zfs/vdev_indirect.c | 14 ++ module/zfs/vdev_mirror.c | 21 ++ module/zfs/vdev_raidz.c | 44 ++++- module/zfs/zfs_vnops.c | 26 ++- module/zfs/zio.c | 120 ++++++++---- tests/runfiles/common.run | 4 +- tests/zfs-tests/cmd/manipulate_user_buffer.c | 180 ++++++++++++------ tests/zfs-tests/tests/Makefile.am | 1 + .../tests/functional/direct/dio.kshlib | 10 +- .../functional/direct/dio_read_verify.ksh | 107 +++++++++++ .../direct/dio_write_stable_pages.ksh | 6 +- .../functional/direct/dio_write_verify.ksh | 18 +- 24 files changed, 510 insertions(+), 146 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/direct/dio_read_verify.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index bc9f90cae08..09a6c604328 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -9224,6 +9224,12 @@ vdev_stats_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, } } + if (cb->cb_print_dio_verify) { + nice_num_str_nvlist(vds, "dio_verify_errors", + vs->vs_dio_verify_errors, cb->cb_literal, + cb->cb_json_as_int, ZFS_NICENUM_1024); + } + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { nice_num_str_nvlist(vds, ZPOOL_CONFIG_NOT_PRESENT, diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 55b150c044e..43d6e3f96ea 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -42,7 +42,8 @@ extern "C" { #define FM_EREPORT_ZFS_DATA "data" #define FM_EREPORT_ZFS_DELAY "delay" #define FM_EREPORT_ZFS_DEADMAN "deadman" -#define FM_EREPORT_ZFS_DIO_VERIFY "dio_verify" +#define FM_EREPORT_ZFS_DIO_VERIFY_WR "dio_verify_wr" +#define FM_EREPORT_ZFS_DIO_VERIFY_RD "dio_verify_rd" #define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index a34bc00ca4d..64f484e9aa1 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -57,7 +57,7 @@ void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_child_done(zio_t *); void vdev_raidz_io_done(zio_t *); void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); -struct raidz_row *vdev_raidz_row_alloc(int); +struct raidz_row *vdev_raidz_row_alloc(int, zio_t *); void vdev_raidz_reflow_copy_scratch(spa_t *); void raidz_dtl_reassessed(vdev_t *); diff --git a/include/sys/zio.h b/include/sys/zio.h index f9409433e03..46f5d68aed4 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -208,25 +208,25 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_PROBE (1ULL << 16) #define ZIO_FLAG_TRYHARD (1ULL << 17) #define ZIO_FLAG_OPTIONAL (1ULL << 18) - +#define ZIO_FLAG_DIO_READ (1ULL << 19) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ -#define ZIO_FLAG_DONT_QUEUE (1ULL << 19) /* must be first for INHERIT */ -#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 20) -#define ZIO_FLAG_IO_BYPASS (1ULL << 21) -#define ZIO_FLAG_IO_REWRITE (1ULL << 22) -#define ZIO_FLAG_RAW_COMPRESS (1ULL << 23) -#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 24) -#define ZIO_FLAG_GANG_CHILD (1ULL << 25) -#define ZIO_FLAG_DDT_CHILD (1ULL << 26) -#define ZIO_FLAG_GODFATHER (1ULL << 27) -#define ZIO_FLAG_NOPWRITE (1ULL << 28) -#define ZIO_FLAG_REEXECUTED (1ULL << 29) -#define ZIO_FLAG_DELEGATED (1ULL << 30) -#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 31) +#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */ +#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21) +#define ZIO_FLAG_IO_BYPASS (1ULL << 22) +#define ZIO_FLAG_IO_REWRITE (1ULL << 23) +#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24) +#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25) +#define ZIO_FLAG_GANG_CHILD (1ULL << 26) +#define ZIO_FLAG_DDT_CHILD (1ULL << 27) +#define ZIO_FLAG_GODFATHER (1ULL << 28) +#define ZIO_FLAG_NOPWRITE (1ULL << 29) +#define ZIO_FLAG_REEXECUTED (1ULL << 30) +#define ZIO_FLAG_DELEGATED (1ULL << 31) +#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) @@ -647,6 +647,7 @@ extern void zio_vdev_io_redone(zio_t *zio); extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void zio_checksum_verified(zio_t *zio); +extern void zio_dio_chksum_verify_error_report(zio_t *zio); extern int zio_worst_error(int e1, int e2); extern enum zio_checksum zio_checksum_select(enum zio_checksum child, diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e19573d5ee1..c9f6ed0dece 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -436,7 +436,7 @@ write. It can also help to identify if reported checksum errors are tied to Direct I/O writes. Each verify error causes a -.Sy dio_verify +.Sy dio_verify_wr zevent. Direct Write I/O checkum verify errors can be seen with .Nm zpool Cm status Fl d . diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 234612baea8..01f849845bd 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -98,7 +98,10 @@ This can be an indicator of problems with the underlying storage device. The number of delay events is ratelimited by the .Sy zfs_slow_io_events_per_second module parameter. -.It Sy dio_verify +.It Sy dio_verify_rd +Issued when there was a checksum verify error after a Direct I/O read has been +issued. +.It Sy dio_verify_wr Issued when there was a checksum verify error after a Direct I/O write has been issued. This event can only take place if the module parameter diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 868fc4414db..f44f3f5f838 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -82,14 +82,18 @@ Specify .Sy --json-pool-key-guid to set pool GUID as key for pool objects instead of pool names. .It Fl d -Display the number of Direct I/O write checksum verify errors that have occured -on a top-level VDEV. +Display the number of Direct I/O read/write checksum verify errors that have +occured on a top-level VDEV. See .Sx zfs_vdev_direct_write_verify in .Xr zfs 4 for details about the conditions that can cause Direct I/O write checksum verify failures to occur. +Direct I/O reads checksum verify errors can also occur if the contents of the +buffer are being manipulated after the I/O has been issued and is in flight. +In the case of Direct I/O read checksum verify errors, the I/O will be reissued +through the ARC. .It Fl D Display a histogram of deduplication statistics, showing the allocated .Pq physically present on disk diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index f20dc5d8c32..ab4f49a4ec5 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -620,9 +620,16 @@ abd_borrow_buf_copy(abd_t *abd, size_t n) /* * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * no change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. + * not change the contents of the ABD. If you want any changes you made to + * buf to be copied back to abd, use abd_return_buf_copy() instead. If the + * ABD is not constructed from user pages from Direct I/O then an ASSERT + * checks to make sure the contents of the buffer have not changed since it was + * borrowed. We can not ASSERT the contents of the buffer have not changed if + * it is composed of user pages. While Direct I/O write pages are placed under + * write protection and can not be changed, this is not the case for Direct I/O + * reads. The pages of a Direct I/O read could be manipulated at any time. + * Checksum verifications in the ZIO pipeline check for this issue and handle + * it by returning an error on checksum verification failure. */ void abd_return_buf(abd_t *abd, void *buf, size_t n) @@ -632,8 +639,34 @@ abd_return_buf(abd_t *abd, void *buf, size_t n) #ifdef ZFS_DEBUG (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); #endif - if (abd_is_linear(abd)) { + if (abd_is_from_pages(abd)) { + if (!abd_is_linear_page(abd)) + zio_buf_free(buf, n); + } else if (abd_is_linear(abd)) { ASSERT3P(buf, ==, abd_to_buf(abd)); + } else if (abd_is_gang(abd)) { +#ifdef ZFS_DEBUG + /* + * We have to be careful with gang ABD's that we do not ASSERT + * for any ABD's that contain user pages from Direct I/O. See + * the comment above about Direct I/O read buffers possibly + * being manipulated. In order to handle this, we jsut iterate + * through the gang ABD and only verify ABD's that are not from + * user pages. + */ + void *cmp_buf = buf; + + for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + if (!abd_is_from_pages(cabd)) { + ASSERT0(abd_cmp_buf(cabd, cmp_buf, + cabd->abd_size)); + } + cmp_buf = (char *)cmp_buf + cabd->abd_size; + } +#endif + zio_buf_free(buf, n); } else { ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 03362b1ee86..303af48cf3a 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1008,7 +1008,9 @@ abd_borrow_buf_copy(abd_t *abd, size_t n) * borrowed. We can not ASSERT that the contents of the buffer have not changed * if it is composed of user pages because the pages can not be placed under * write protection and the user could have possibly changed the contents in - * the pages at any time. + * the pages at any time. This is also an issue for Direct I/O reads. Checksum + * verifications in the ZIO pipeline check for this issue and handle it by + * returning an error on checksum verification failure. */ void abd_return_buf(abd_t *abd, void *buf, size_t n) diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c index 622323bbbd5..43bccea14a8 100644 --- a/module/zcommon/zfs_valstr.c +++ b/module/zcommon/zfs_valstr.c @@ -206,6 +206,7 @@ _VALSTR_BITFIELD_IMPL(zio_flag, { '.', "PR", "PROBE" }, { '.', "TH", "TRYHARD" }, { '.', "OP", "OPTIONAL" }, + { '.', "RD", "DIO_READ" }, { '.', "DQ", "DONT_QUEUE" }, { '.', "DP", "DONT_PROPAGATE" }, { '.', "BY", "IO_BYPASS" }, diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index 91a7fd8df46..ed96e7515bc 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -330,7 +330,7 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, */ zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size, dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL, &zb); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb); mutex_exit(&db->db_mtx); zfs_racct_read(spa, db->db.db_size, 1, flags); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 13bb33cc687..419c8ac5bb2 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1026,7 +1026,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, ASSERT3U(vdc->vdc_nparity, >, 0); - raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth); + raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth, zio); rr->rr_bigcols = bc; rr->rr_firstdatacol = vdc->vdc_nparity; #ifdef ZFS_DEBUG diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index acb72569667..e3dba0257b2 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -34,6 +34,7 @@ #include #include #include +#include /* * An indirect vdev corresponds to a vdev that has been removed. Since @@ -1832,6 +1833,19 @@ vdev_indirect_io_done(zio_t *zio) zio_bad_cksum_t zbc; int ret = zio_checksum_error(zio, &zbc); + /* + * Any Direct I/O read that has a checksum error must be treated as + * suspicious as the contents of the buffer could be getting + * manipulated while the I/O is taking place. The checksum verify error + * will be reported to the top-level VDEV. + */ + if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { + zio->io_error = ret; + zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio_dio_chksum_verify_error_report(zio); + ret = 0; + } + if (ret == 0) { zio_checksum_verified(zio); return; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 102eacb0334..65a840bf972 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -764,6 +764,27 @@ vdev_mirror_io_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * Any Direct I/O read that has a checksum error must be treated as + * suspicious as the contents of the buffer could be getting + * manipulated while the I/O is taking place. The checksum verify error + * will be reported to the top-level Mirror VDEV. + * + * There will be no attampt at reading any additional data copies. If + * the buffer is still being manipulated while attempting to read from + * another child, there exists a possibly that the checksum could be + * verified as valid. However, the buffer contents could again get + * manipulated after verifying the checksum. This would lead to bad data + * being written out during self healing. + */ + if ((zio->io_flags & ZIO_FLAG_DIO_READ) && + (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { + zio_dio_chksum_verify_error_report(zio); + zio->io_error = vdev_mirror_worst_error(mm); + ASSERT3U(zio->io_error, ==, ECKSUM); + return; + } + /* * If we don't have a good copy yet, keep trying other children. */ diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 15c8b8ca601..5e330626be2 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -433,7 +433,7 @@ const zio_vsd_ops_t vdev_raidz_vsd_ops = { }; raidz_row_t * -vdev_raidz_row_alloc(int cols) +vdev_raidz_row_alloc(int cols, zio_t *zio) { raidz_row_t *rr = kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); @@ -445,7 +445,17 @@ vdev_raidz_row_alloc(int cols) raidz_col_t *rc = &rr->rr_col[c]; rc->rc_shadow_devidx = INT_MAX; rc->rc_shadow_offset = UINT64_MAX; - rc->rc_allow_repair = 1; + /* + * We can not allow self healing to take place for Direct I/O + * reads. There is nothing that stops the buffer contents from + * being manipulated while the I/O is in flight. It is possible + * that the checksum could be verified on the buffer and then + * the contents of that buffer are manipulated afterwards. This + * could lead to bad data being written out during self + * healing. + */ + if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) + rc->rc_allow_repair = 1; } return (rr); } @@ -619,7 +629,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } ASSERT3U(acols, <=, scols); - rr = vdev_raidz_row_alloc(scols); + rr = vdev_raidz_row_alloc(scols, zio); rm->rm_row[0] = rr; rr->rr_cols = acols; rr->rr_bigcols = bc; @@ -765,7 +775,7 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, for (uint64_t row = 0; row < rows; row++) { boolean_t row_use_scratch = B_FALSE; - raidz_row_t *rr = vdev_raidz_row_alloc(cols); + raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); rm->rm_row[row] = rr; /* The starting RAIDZ (parent) vdev sector of the row. */ @@ -2633,6 +2643,20 @@ raidz_checksum_verify(zio_t *zio) raidz_map_t *rm = zio->io_vsd; int ret = zio_checksum_error(zio, &zbc); + /* + * Any Direct I/O read that has a checksum error must be treated as + * suspicious as the contents of the buffer could be getting + * manipulated while the I/O is taking place. The checksum verify error + * will be reported to the top-level RAIDZ VDEV. + */ + if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { + zio->io_error = ret; + zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio_dio_chksum_verify_error_report(zio); + zio_checksum_verified(zio); + return (0); + } + if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; @@ -2776,6 +2800,11 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) (rc->rc_error == 0 || rc->rc_size == 0)) { continue; } + /* + * We do not allow self healing for Direct I/O reads. + * See comment in vdev_raid_row_alloc(). + */ + ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " "offset=%llx", @@ -2979,6 +3008,8 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) /* Check for success */ if (raidz_checksum_verify(zio) == 0) { + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + return (0); /* Reconstruction succeeded - report errors */ for (int i = 0; i < rm->rm_nrows; i++) { @@ -3379,7 +3410,6 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio) zio_bad_cksum_t zbc; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); @@ -3444,6 +3474,9 @@ vdev_raidz_io_done(zio_t *zio) } if (raidz_checksum_verify(zio) == 0) { + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + goto done; + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); @@ -3538,6 +3571,7 @@ vdev_raidz_io_done(zio_t *zio) } } } +done: if (rm->rm_lr != NULL) { zfs_rangelock_exit(rm->rm_lr); rm->rm_lr = NULL; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a96f508ffac..d5e0d2a2b35 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -303,6 +303,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) (void) cr; int error = 0; boolean_t frsync = B_FALSE; + boolean_t dio_checksum_failure = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) @@ -424,8 +425,26 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (error) { /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); + if (error == ECKSUM) { + /* + * If a Direct I/O read returned a checksum + * verify error, then it must be treated as + * suspicious. The contents of the buffer could + * have beeen manipulated while the I/O was in + * flight. In this case, the remainder of I/O + * request will just be reissued through the + * ARC. + */ + if (uio->uio_extflg & UIO_DIRECT) { + dio_checksum_failure = B_TRUE; + uio->uio_extflg &= ~UIO_DIRECT; + n += dio_remaining_resid; + dio_remaining_resid = 0; + continue; + } else { + error = SET_ERROR(EIO); + } + } #if defined(__linux__) /* @@ -472,6 +491,9 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) out: zfs_rangelock_exit(lr); + if (dio_checksum_failure == B_TRUE) + uio->uio_extflg |= UIO_DIRECT; + /* * Cleanup for Direct I/O if requested. */ diff --git a/module/zfs/zio.c b/module/zfs/zio.c index b26f5e80abf..a5daf73d59b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -804,11 +804,11 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { - ASSERT3U(*errorp, ==, EIO); - ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + /* + * Propogate the Direct I/O checksum verify failure to the parent. + */ + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; - } (*countp)--; @@ -1573,6 +1573,14 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; + /* + * We never allow the mirror VDEV to attempt reading from any + * additional data copies after the first Direct I/O checksum + * verify failure. This is to avoid bad data being written out + * through the mirror during self healing. See comment in + * vdev_mirror_io_done() for more details. + */ + ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); } else if (type == ZIO_TYPE_WRITE && pio->io_prop.zp_direct_write == B_TRUE) { /* @@ -4555,18 +4563,18 @@ zio_vdev_io_assess(zio_t *zio) } /* - * If a Direct I/O write checksum verify error has occurred then this - * I/O should not attempt to be issued again. Instead the EIO will - * be returned. + * If a Direct I/O operation has a checksum verify error then this I/O + * should not attempt to be issued again. */ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { - ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); - ASSERT3U(zio->io_error, ==, EIO); + if (zio->io_type == ZIO_TYPE_WRITE) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); + ASSERT3U(zio->io_error, ==, EIO); + } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } - if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); @@ -4864,16 +4872,40 @@ zio_checksum_verify(zio_t *zio) ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } + ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); + IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ, + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)); + if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - (void) zfs_ereport_start_checksum(zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, - zio->io_offset, zio->io_size, &info); + if (zio->io_flags & ZIO_FLAG_DIO_READ) { + zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio_t *pio = zio_unique_parent(zio); + /* + * Any Direct I/O read that has a checksum + * error must be treated as suspicous as the + * contents of the buffer could be getting + * manipulated while the I/O is taking place. + * + * The checksum verify error will only be + * reported here for disk and file VDEV's and + * will be reported on those that the failure + * occurred on. Other types of VDEV's report the + * verify failure in their own code paths. + */ + if (pio->io_child_type == ZIO_CHILD_LOGICAL) { + zio_dio_chksum_verify_error_report(zio); + } + } else { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + (void) zfs_ereport_start_checksum(zio->io_spa, + zio->io_vd, &zio->io_bookmark, zio, + zio->io_offset, zio->io_size, &info); + } } } @@ -4899,22 +4931,8 @@ zio_dio_checksum_verify(zio_t *zio) if ((error = zio_checksum_error(zio, NULL)) != 0) { zio->io_error = error; if (error == ECKSUM) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_dio_verify_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - zio->io_error = SET_ERROR(EIO); zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; - - /* - * The EIO error must be propagated up to the logical - * parent ZIO in zio_notify_parent() so it can be - * returned to dmu_write_abd(). - */ - zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE; - - (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY, - zio->io_spa, zio->io_vd, &zio->io_bookmark, - zio, 0); + zio_dio_chksum_verify_error_report(zio); } } @@ -4932,6 +4950,39 @@ zio_checksum_verified(zio_t *zio) zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } +/* + * Report Direct I/O checksum verify error and create ZED event. + */ +void +zio_dio_chksum_verify_error_report(zio_t *zio) +{ + ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); + + if (zio->io_child_type == ZIO_CHILD_LOGICAL) + return; + + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_dio_verify_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * Convert checksum error for writes into EIO. + */ + zio->io_error = SET_ERROR(EIO); + /* + * Report dio_verify_wr ZED event. + */ + (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_WR, + zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); + } else { + /* + * Report dio_verify_rd ZED event. + */ + (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_RD, + zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); + } +} + /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. @@ -5343,10 +5394,9 @@ zio_done(zio_t *zio) if (zio->io_reexecute) { /* - * A Direct I/O write that has a checksum verify error should - * not attempt to reexecute. Instead, EAGAIN should just be - * propagated back up so the write can be attempt to be issued - * through the ARC. + * A Direct I/O operation that has a checksum verify error + * should not attempt to reexecute. Instead, the error should + * just be propagated back. */ ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f89a4b3e0aa..fc4adc42d00 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -697,8 +697,8 @@ tags = ['functional', 'delegate'] tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines', 'dio_compression', 'dio_dedup', 'dio_encryption', 'dio_grow_block', 'dio_max_recordsize', 'dio_mixed', 'dio_mmap', 'dio_overwrites', - 'dio_property', 'dio_random', 'dio_recordsize', 'dio_unaligned_block', - 'dio_unaligned_filesize'] + 'dio_property', 'dio_random', 'dio_read_verify', 'dio_recordsize', + 'dio_unaligned_block', 'dio_unaligned_filesize'] tags = ['functional', 'direct'] [tests/functional/exec] diff --git a/tests/zfs-tests/cmd/manipulate_user_buffer.c b/tests/zfs-tests/cmd/manipulate_user_buffer.c index 714f4220055..17358109444 100644 --- a/tests/zfs-tests/cmd/manipulate_user_buffer.c +++ b/tests/zfs-tests/cmd/manipulate_user_buffer.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2022 by Triad National Security, LLC. + * Copyright (c) 2024 by Triad National Security, LLC. */ #include @@ -39,51 +39,59 @@ #define MIN(a, b) ((a) < (b)) ? (a) : (b) #endif -static char *outputfile = NULL; +static char *filename = NULL; static int blocksize = 131072; /* 128K */ -static int wr_err_expected = 0; +static int err_expected = 0; +static int read_op = 0; +static int write_op = 0; static int numblocks = 100; static char *execname = NULL; static int print_usage = 0; static int randompattern = 0; -static int ofd; +static int fd; char *buf = NULL; typedef struct { - int entire_file_written; + int entire_file_completed; } pthread_args_t; static void usage(void) { (void) fprintf(stderr, - "usage %s -o outputfile [-b blocksize] [-e wr_error_expected]\n" - " [-n numblocks] [-p randpattern] [-h help]\n" + "usage %s -f filename [-b blocksize] [-e wr_error_expected]\n" + " [-n numblocks] [-p randompattern] -r read_op \n" + " -w write_op [-h help]\n" "\n" "Testing whether checksum verify works correctly for O_DIRECT.\n" "when manipulating the contents of a userspace buffer.\n" "\n" - " outputfile: File to write to.\n" - " blocksize: Size of each block to write (must be at \n" - " least >= 512).\n" - " wr_err_expected: Whether pwrite() is expected to return EIO\n" - " while manipulating the contents of the\n" - " buffer.\n" - " numblocks: Total number of blocksized blocks to\n" - " write.\n" - " randpattern: Fill data buffer with random data. Default\n" - " behavior is to fill the buffer with the \n" - " known data pattern (0xdeadbeef).\n" + " filename: File to read or write to.\n" + " blocksize: Size of each block to write (must be at \n" + " least >= 512).\n" + " err_expected: Whether write() is expected to return EIO\n" + " while manipulating the contents of the\n" + " buffer.\n" + " numblocks: Total number of blocksized blocks to\n" + " write.\n" + " read_op: Perform reads to the filename file while\n" + " while manipulating the buffer contents\n" + " write_op: Perform writes to the filename file while\n" + " manipulating the buffer contents\n" + " randompattern: Fill data buffer with random data for \n" + " writes. Default behavior is to fill the \n" + " buffer with known data pattern (0xdeadbeef)\n" " help: Print usage information and exit.\n" "\n" " Required parameters:\n" - " outputfile\n" + " filename\n" + " read_op or write_op\n" "\n" " Default Values:\n" " blocksize -> 131072\n" " wr_err_expexted -> false\n" " numblocks -> 100\n" - " randpattern -> false\n", + " randompattern -> false\n", execname); (void) exit(1); } @@ -97,16 +105,21 @@ parse_options(int argc, char *argv[]) extern int optind, optopt; execname = argv[0]; - while ((c = getopt(argc, argv, "b:ehn:o:p")) != -1) { + while ((c = getopt(argc, argv, "b:ef:hn:rw")) != -1) { switch (c) { case 'b': blocksize = atoi(optarg); break; case 'e': - wr_err_expected = 1; + err_expected = 1; break; + case 'f': + filename = optarg; + break; + + case 'h': print_usage = 1; break; @@ -115,12 +128,12 @@ parse_options(int argc, char *argv[]) numblocks = atoi(optarg); break; - case 'o': - outputfile = optarg; + case 'r': + read_op = 1; break; - case 'p': - randompattern = 1; + case 'w': + write_op = 1; break; case ':': @@ -141,7 +154,8 @@ parse_options(int argc, char *argv[]) if (errflag || print_usage == 1) (void) usage(); - if (blocksize < 512 || outputfile == NULL || numblocks <= 0) { + if (blocksize < 512 || filename == NULL || numblocks <= 0 || + (read_op == 0 && write_op == 0)) { (void) fprintf(stderr, "Required paramater(s) missing or invalid.\n"); (void) usage(); @@ -160,10 +174,10 @@ write_thread(void *arg) ssize_t wrote = 0; pthread_args_t *args = (pthread_args_t *)arg; - while (!args->entire_file_written) { - wrote = pwrite(ofd, buf, blocksize, offset); + while (!args->entire_file_completed) { + wrote = pwrite(fd, buf, blocksize, offset); if (wrote != blocksize) { - if (wr_err_expected) + if (err_expected) assert(errno == EIO); else exit(2); @@ -173,7 +187,35 @@ write_thread(void *arg) left -= blocksize; if (left == 0) - args->entire_file_written = 1; + args->entire_file_completed = 1; + } + + pthread_exit(NULL); +} + +/* + * Read blocksize * numblocks to the file using O_DIRECT. + */ +static void * +read_thread(void *arg) +{ + size_t offset = 0; + int total_data = blocksize * numblocks; + int left = total_data; + ssize_t read = 0; + pthread_args_t *args = (pthread_args_t *)arg; + + while (!args->entire_file_completed) { + read = pread(fd, buf, blocksize, offset); + if (read != blocksize) { + exit(2); + } + + offset = ((offset + blocksize) % total_data); + left -= blocksize; + + if (left == 0) + args->entire_file_completed = 1; } pthread_exit(NULL); @@ -189,7 +231,7 @@ manipulate_buf_thread(void *arg) char rand_char; pthread_args_t *args = (pthread_args_t *)arg; - while (!args->entire_file_written) { + while (!args->entire_file_completed) { rand_offset = (rand() % blocksize); rand_char = (rand() % (126 - 33) + 33); buf[rand_offset] = rand_char; @@ -202,9 +244,9 @@ int main(int argc, char *argv[]) { const char *datapattern = "0xdeadbeef"; - int ofd_flags = O_WRONLY | O_CREAT | O_DIRECT; + int fd_flags = O_DIRECT; mode_t mode = S_IRUSR | S_IWUSR; - pthread_t write_thr; + pthread_t io_thr; pthread_t manipul_thr; int left = blocksize; int offset = 0; @@ -213,9 +255,15 @@ main(int argc, char *argv[]) parse_options(argc, argv); - ofd = open(outputfile, ofd_flags, mode); - if (ofd == -1) { - (void) fprintf(stderr, "%s, %s\n", execname, outputfile); + if (write_op) { + fd_flags |= (O_WRONLY | O_CREAT); + } else { + fd_flags |= O_RDONLY; + } + + fd = open(filename, fd_flags, mode); + if (fd == -1) { + (void) fprintf(stderr, "%s, %s\n", execname, filename); perror("open"); exit(2); } @@ -228,24 +276,22 @@ main(int argc, char *argv[]) exit(2); } - if (!randompattern) { - /* Putting known data pattern in buffer */ - while (left) { - size_t amt = MIN(strlen(datapattern), left); - memcpy(&buf[offset], datapattern, amt); - offset += amt; - left -= amt; + if (write_op) { + if (!randompattern) { + /* Putting known data pattern in buffer */ + while (left) { + size_t amt = MIN(strlen(datapattern), left); + memcpy(&buf[offset], datapattern, amt); + offset += amt; + left -= amt; + } + } else { + /* Putting random data in buffer */ + for (int i = 0; i < blocksize; i++) + buf[i] = rand(); } - } else { - /* Putting random data in buffer */ - for (int i = 0; i < blocksize; i++) - buf[i] = rand(); } - /* - * Writing using O_DIRECT while manipulating the buffer contents until - * the entire file is written. - */ if ((rc = pthread_create(&manipul_thr, NULL, manipulate_buf_thread, &args))) { fprintf(stderr, "error: pthreads_create, manipul_thr, " @@ -253,18 +299,34 @@ main(int argc, char *argv[]) exit(2); } - if ((rc = pthread_create(&write_thr, NULL, write_thread, &args))) { - fprintf(stderr, "error: pthreads_create, write_thr, " - "rc: %d\n", rc); - exit(2); + if (write_op) { + /* + * Writing using O_DIRECT while manipulating the buffer contents + * until the entire file is written. + */ + if ((rc = pthread_create(&io_thr, NULL, write_thread, &args))) { + fprintf(stderr, "error: pthreads_create, io_thr, " + "rc: %d\n", rc); + exit(2); + } + } else { + /* + * Reading using O_DIRECT while manipulating the buffer contents + * until the entire file is read. + */ + if ((rc = pthread_create(&io_thr, NULL, read_thread, &args))) { + fprintf(stderr, "error: pthreads_create, io_thr, " + "rc: %d\n", rc); + exit(2); + } } - pthread_join(write_thr, NULL); + pthread_join(io_thr, NULL); pthread_join(manipul_thr, NULL); - assert(args.entire_file_written == 1); + assert(args.entire_file_completed == 1); - (void) close(ofd); + (void) close(fd); free(buf); diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 206ee8ac154..bc767b9f624 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1477,6 +1477,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/direct/dio_overwrites.ksh \ functional/direct/dio_property.ksh \ functional/direct/dio_random.ksh \ + functional/direct/dio_read_verify.ksh \ functional/direct/dio_recordsize.ksh \ functional/direct/dio_unaligned_block.ksh \ functional/direct/dio_unaligned_filesize.ksh \ diff --git a/tests/zfs-tests/tests/functional/direct/dio.kshlib b/tests/zfs-tests/tests/functional/direct/dio.kshlib index 3a70cf29396..5b3f893e1ce 100644 --- a/tests/zfs-tests/tests/functional/direct/dio.kshlib +++ b/tests/zfs-tests/tests/functional/direct/dio.kshlib @@ -84,8 +84,9 @@ function get_zpool_status_chksum_verify_failures # pool_name vdev_type function get_zed_dio_verify_events # pool { typeset pool=$1 + typeset op=$2 - val=$(zpool events $pool | grep -c dio_verify) + val=$(zpool events $pool | grep -c "dio_verify_${op}") echo "$val" } @@ -96,11 +97,12 @@ function get_zed_dio_verify_events # pool # zpool events # After getting that counts will clear the out the ZPool errors and events # -function check_dio_write_chksum_verify_failures # pool vdev_type expect_errors +function check_dio_chksum_verify_failures # pool vdev_type op expect_errors { typeset pool=$1 typeset vdev_type=$2 typeset expect_errors=$3 + typeset op=$4 typeset note_str="expecting none" if [[ $expect_errors -ne 0 ]]; then @@ -108,10 +110,10 @@ function check_dio_write_chksum_verify_failures # pool vdev_type expect_errors fi log_note "Checking for Direct I/O write checksum verify errors \ - $note_str on ZPool: $pool" + $note_str on ZPool: $pool with $vdev_type" status_failures=$(get_zpool_status_chksum_verify_failures $pool $vdev_type) - zed_dio_verify_events=$(get_zed_dio_verify_events $pool) + zed_dio_verify_events=$(get_zed_dio_verify_events $pool $op) if [[ $expect_errors -ne 0 ]]; then if [[ $status_failures -eq 0 || diff --git a/tests/zfs-tests/tests/functional/direct/dio_read_verify.ksh b/tests/zfs-tests/tests/functional/direct/dio_read_verify.ksh new file mode 100755 index 00000000000..456d429b1d9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_read_verify.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify checksum verify works for Direct I/O reads. +# +# STRATEGY: +# 1. Create a zpool from each vdev type. +# 2. Start a Direct I/O read workload while manipulating the user buffer +# contents. +# 3. Verify there are Direct I/O read verify failures using +# zpool status -d and checking for zevents. We also make sure there +# are reported no data errors. +# + +verify_runnable "global" + +log_assert "Verify checksum verify works for Direct I/O reads." + +log_onexit dio_cleanup + +NUMBLOCKS=300 +BS=$((128 * 1024)) # 128k + +log_must truncate -s $MINVDEVSIZE $DIO_VDEVS + +# We will verify that there are no checksum errors for every Direct I/O read +# while manipulating the buffer contents while the I/O is still in flight and +# also that Direct I/O checksum verify failures and dio_verify_rd zevents are +# reported. + + +for type in "" "mirror" "raidz" "draid"; do + typeset vdev_type=$type + if [[ "${vdev_type}" == "" ]]; then + vdev_type="stripe" + fi + + log_note "Verifying every Direct I/O read verify with VDEV type \ + ${vdev_type}" + + create_pool $TESTPOOL1 $type $DIO_VDEVS + log_must eval "zfs create -o recordsize=128k -o compression=off \ + $TESTPOOL1/$TESTFS1" + + mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) + prev_dio_rd=$(get_iostats_stat $TESTPOOL1 direct_read_count) + prev_arc_rd=$(get_iostats_stat $TESTPOOL1 arc_read_count) + + # Create the file before trying to manipulate the contents + log_must stride_dd -o "$mntpnt/direct-write.iso" -i /dev/urandom \ + -b $BS -c $NUMBLOCKS -D + # Manipulate the buffer contents will reading the file with Direct I/O + log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS -r + + # Getting new Direct I/O and ARC Write counts. + curr_dio_rd=$(get_iostats_stat $TESTPOOL1 direct_read_count) + curr_arc_rd=$(get_iostats_stat $TESTPOOL1 arc_read_count) + total_dio_rd=$((curr_dio_rd - prev_dio_rd)) + total_arc_rd=$((curr_arc_rd - prev_arc_rd)) + + log_note "Making sure there are no checksum errors with the ZPool" + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + log_note "Making sure we have Direct I/O and ARC reads logged" + if [[ $total_dio_rd -lt 1 ]]; then + log_fail "No Direct I/O reads $total_dio_rd" + fi + if [[ $total_arc_rd -lt 1 ]]; then + log_fail "No ARC reads $total_arc_rd" + fi + + log_note "Making sure we have Direct I/O write checksum verifies with ZPool" + check_dio_chksum_verify_failures "$TESTPOOL1" "$vdev_type" 1 "rd" + destroy_pool $TESTPOOL1 +done + +log_pass "Verified checksum verify works for Direct I/O reads." diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh index efc9ee63918..ccdabc678a6 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh @@ -46,7 +46,7 @@ verify_runnable "global" function cleanup { log_must rm -f "$mntpnt/direct-write.iso" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 + check_dio_chksum_verify_failures $TESTPOOL "raidz" 0 "wr" } log_assert "Verify stable pages work for Direct I/O writes." @@ -76,8 +76,8 @@ do # Manipulate the user's buffer while running O_DIRECT write # workload with the buffer. - log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ - -n $NUMBLOCKS -b $BS + log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS -w # Reading back the contents of the file log_must stride_dd -i $mntpnt/direct-write.iso -o /dev/null \ diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh index 536459a35e6..4eb9efe95ef 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh @@ -91,8 +91,8 @@ log_must set_tunable32 VDEV_DIRECT_WR_VERIFY 0 log_note "Verifying no panics for Direct I/O writes with compression" log_must zfs set compression=on $TESTPOOL/$TESTFS prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) -log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" -n $NUMBLOCKS \ - -b $BS +log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" -n $NUMBLOCKS \ + -b $BS -w curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) @@ -116,8 +116,8 @@ for i in $(seq 1 $ITERATIONS); do $i of $ITERATIONS with zfs_vdev_direct_write_verify=0" prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ - -n $NUMBLOCKS -b $BS + log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS -w # Reading file back to verify checksum errors filesize=$(get_file_size "$mntpnt/direct-write.iso") @@ -144,7 +144,7 @@ for i in $(seq 1 $ITERATIONS); do fi log_note "Making sure we have no Direct I/O write checksum verifies \ with ZPool" - check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 + check_dio_chksum_verify_failures $TESTPOOL "raidz" 0 "wr" log_must rm -f "$mntpnt/direct-write.iso" done @@ -166,8 +166,8 @@ for i in $(seq 1 $ITERATIONS); do $ITERATIONS with zfs_vdev_direct_write_verify=1" prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) - log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ - -n $NUMBLOCKS -b $BS -e + log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS -e -w # Reading file back to verify there no are checksum errors filesize=$(get_file_size "$mntpnt/direct-write.iso") @@ -175,7 +175,7 @@ for i in $(seq 1 $ITERATIONS); do log_must stride_dd -i "$mntpnt/direct-write.iso" -o /dev/null -b $BS \ -c $num_blocks - # Getting new Direct I/O and ARC Write counts. + # Getting new Direct I/O write counts. curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) @@ -188,7 +188,7 @@ for i in $(seq 1 $ITERATIONS); do fi log_note "Making sure we have Direct I/O write checksum verifies with ZPool" - check_dio_write_chksum_verify_failures "$TESTPOOL" "raidz" 1 + check_dio_chksum_verify_failures "$TESTPOOL" "raidz" 1 "wr" done log_must rm -f "$mntpnt/direct-write.iso" From 7bf525530afdd71af14c9e740a3ed5af3876826a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 12 Oct 2024 03:37:57 +1100 Subject: [PATCH 15/32] zpool/zfs: allow --json wherever -j is allowed Mostly so that with the JSON formatting options are also used, they all look the same. To my eye, `-j --json-flat-vdevs` suggests that they are different or unrelated, while `--json --json-flat-vdevs` invites no further questions. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Umer Saleem Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Rob Norris Closes #16632 --- cmd/zfs/zfs_main.c | 28 ++++++++-- cmd/zpool/zpool_main.c | 10 +++- man/man8/zfs-list.8 | 2 +- man/man8/zfs-mount.8 | 2 +- man/man8/zfs-program.8 | 2 +- man/man8/zfs-set.8 | 2 +- man/man8/zpool-get.8 | 4 +- man/man8/zpool-list.8 | 2 +- man/man8/zpool-status.8 | 2 +- .../functional/cli_root/json/json_sanity.ksh | 51 +++++++++++-------- 10 files changed, 72 insertions(+), 33 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index f979fd189cc..97397726c70 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -2162,6 +2162,7 @@ zfs_do_get(int argc, char **argv) cb.cb_type = ZFS_TYPE_DATASET; struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZFS_OPTION_JSON_NUMS_AS_INT}, {0, 0, 0, 0} }; @@ -3852,6 +3853,7 @@ zfs_do_list(int argc, char **argv) nvlist_t *data = NULL; struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZFS_OPTION_JSON_NUMS_AS_INT}, {0, 0, 0, 0} }; @@ -7436,9 +7438,15 @@ share_mount(int op, int argc, char **argv) uint_t nthr; jsobj = data = item = NULL; + struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, op == OP_MOUNT ? ":ajRlvo:Of" : "al")) - != -1) { + while ((c = getopt_long(argc, argv, + op == OP_MOUNT ? ":ajRlvo:Of" : "al", + op == OP_MOUNT ? long_options : NULL, NULL)) != -1) { switch (c) { case 'a': do_all = 1; @@ -8374,8 +8382,14 @@ zfs_do_channel_program(int argc, char **argv) boolean_t sync_flag = B_TRUE, json_output = B_FALSE; zpool_handle_t *zhp; + struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "nt:m:j")) != -1) { + while ((c = getopt_long(argc, argv, "nt:m:j", long_options, + NULL)) != -1) { switch (c) { case 't': case 'm': { @@ -9083,7 +9097,13 @@ zfs_do_version(int argc, char **argv) int c; nvlist_t *jsobj = NULL, *zfs_ver = NULL; boolean_t json = B_FALSE; - while ((c = getopt(argc, argv, "j")) != -1) { + + struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, + {0, 0, 0, 0} + }; + + while ((c = getopt_long(argc, argv, "j", long_options, NULL)) != -1) { switch (c) { case 'j': json = B_TRUE; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 09a6c604328..ea180f6b705 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7340,6 +7340,7 @@ zpool_do_list(int argc, char **argv) current_prop_type = ZFS_TYPE_POOL; struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-pool-key-guid", no_argument, NULL, ZPOOL_OPTION_POOL_KEY_GUID}, @@ -10981,6 +10982,7 @@ zpool_do_status(int argc, char **argv) struct option long_options[] = { {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-flat-vdevs", no_argument, NULL, ZPOOL_OPTION_JSON_FLAT_VDEVS}, @@ -12589,6 +12591,7 @@ zpool_do_get(int argc, char **argv) current_prop_type = cb.cb_type; struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-pool-key-guid", no_argument, NULL, ZPOOL_OPTION_POOL_KEY_GUID}, @@ -13503,7 +13506,12 @@ zpool_do_version(int argc, char **argv) int c; nvlist_t *jsobj = NULL, *zfs_ver = NULL; boolean_t json = B_FALSE; - while ((c = getopt(argc, argv, "j")) != -1) { + + struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, + }; + + while ((c = getopt_long(argc, argv, "j", long_options, NULL)) != -1) { switch (c) { case 'j': json = B_TRUE; diff --git a/man/man8/zfs-list.8 b/man/man8/zfs-list.8 index b49def08b72..0fa7e9cc261 100644 --- a/man/man8/zfs-list.8 +++ b/man/man8/zfs-list.8 @@ -71,7 +71,7 @@ The following fields are displayed: Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary white space. -.It Fl j Op Ar --json-int +.It Fl j , -json Op Ar --json-int Print the output in JSON format. Specify .Sy --json-int diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8 index 6116fbaab77..921e2499e31 100644 --- a/man/man8/zfs-mount.8 +++ b/man/man8/zfs-mount.8 @@ -59,7 +59,7 @@ .Xc Displays all ZFS file systems currently mounted. .Bl -tag -width "-j" -.It Fl j +.It Fl j , -json Displays all mounted file systems in JSON format. .El .It Xo diff --git a/man/man8/zfs-program.8 b/man/man8/zfs-program.8 index 928620362be..460cd2e11cf 100644 --- a/man/man8/zfs-program.8 +++ b/man/man8/zfs-program.8 @@ -50,7 +50,7 @@ and any attempts to access or modify other pools will cause an error. . .Sh OPTIONS .Bl -tag -width "-t" -.It Fl j +.It Fl j , -json Display channel program output in JSON format. When this flag is specified and standard output is empty - channel program encountered an error. diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8 index 204450d72ec..f1718608b44 100644 --- a/man/man8/zfs-set.8 +++ b/man/man8/zfs-set.8 @@ -130,7 +130,7 @@ The value can be used to display all properties that apply to the given dataset's type .Pq Sy filesystem , volume , snapshot , No or Sy bookmark . .Bl -tag -width "-s source" -.It Fl j Op Ar --json-int +.It Fl j , -json Op Ar --json-int Display the output in JSON format. Specify .Sy --json-int diff --git a/man/man8/zpool-get.8 b/man/man8/zpool-get.8 index 5384906f17f..1be83526d22 100644 --- a/man/man8/zpool-get.8 +++ b/man/man8/zpool-get.8 @@ -98,7 +98,7 @@ See the .Xr zpoolprops 7 manual page for more information on the available pool properties. .Bl -tag -compact -offset Ds -width "-o field" -.It Fl j Op Ar --json-int, --json-pool-key-guid +.It Fl j , -json Op Ar --json-int, --json-pool-key-guid Display the list of properties in JSON format. Specify .Sy --json-int @@ -157,7 +157,7 @@ See the .Xr vdevprops 7 manual page for more information on the available pool properties. .Bl -tag -compact -offset Ds -width "-o field" -.It Fl j Op Ar --json-int +.It Fl j , -json Op Ar --json-int Display the list of properties in JSON format. Specify .Sy --json-int diff --git a/man/man8/zpool-list.8 b/man/man8/zpool-list.8 index b0ee659701d..6d3478a6871 100644 --- a/man/man8/zpool-list.8 +++ b/man/man8/zpool-list.8 @@ -59,7 +59,7 @@ is specified, the command exits after .Ar count reports are printed. .Bl -tag -width Ds -.It Fl j Op Ar --json-int, --json-pool-key-guid +.It Fl j , -json Op Ar --json-int, --json-pool-key-guid Display the list of pools in JSON format. Specify .Sy --json-int diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index f44f3f5f838..b9b54185d05 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -70,7 +70,7 @@ See the option of .Nm zpool Cm iostat for complete details. -.It Fl j Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid +.It Fl j , -json Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid Display the status for ZFS pools in JSON format. Specify .Sy --json-int diff --git a/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh b/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh index e598dd57181..d092a3b0e82 100755 --- a/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh @@ -30,28 +30,39 @@ # STRATEGY: # 1. Run different zfs/zpool -j commands and check for valid JSON +# +# -j and --json mean the same thing. Each command will be run twice, replacing +# JSONFLAG with the flag under test. list=( - "zpool status -j -g --json-int --json-flat-vdevs --json-pool-key-guid" - "zpool status -p -j -g --json-int --json-flat-vdevs --json-pool-key-guid" - "zpool status -j -c upath" - "zpool status -j" - "zpool status -j testpool1" - "zpool list -j" - "zpool list -j -g" - "zpool list -j -o fragmentation" - "zpool get -j size" - "zpool get -j all" - "zpool version -j" - "zfs list -j" - "zfs list -j testpool1" - "zfs get -j all" - "zfs get -j available" - "zfs mount -j" - "zfs version -j" + "zpool status JSONFLAG -g --json-int --json-flat-vdevs --json-pool-key-guid" + "zpool status -p JSONFLAG -g --json-int --json-flat-vdevs --json-pool-key-guid" + "zpool status JSONFLAG -c upath" + "zpool status JSONFLAG" + "zpool status JSONFLAG testpool1" + "zpool list JSONFLAG" + "zpool list JSONFLAG -g" + "zpool list JSONFLAG -o fragmentation" + "zpool get JSONFLAG size" + "zpool get JSONFLAG all" + "zpool version JSONFLAG" + "zfs list JSONFLAG" + "zfs list JSONFLAG testpool1" + "zfs get JSONFLAG all" + "zfs get JSONFLAG available" + "zfs mount JSONFLAG" + "zfs version JSONFLAG" ) -for cmd in "${list[@]}" ; do - log_must eval "$cmd | jq > /dev/null" -done +function run_json_tests +{ + typeset flag=$1 + for cmd in "${list[@]}" ; do + cmd=${cmd//JSONFLAG/$flag} + log_must eval "$cmd | jq > /dev/null" + done +} + +log_must run_json_tests -j +log_must run_json_tests --json log_pass "zpool and zfs commands outputted valid JSON" From 7e4be92750a324850474ab9b61f9d8321f9619e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Fri, 11 Oct 2024 18:55:17 +0200 Subject: [PATCH 16/32] zdb: fix printf format in dump_zap() When compiling zdb.c on 32-bit platforms, a format conversion error is reported for a printf() in dump_zap(). Change %l to macro %" PRIu64 " to match the platform size of a 64-bit unsigned integer. Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Closes #16635 --- cmd/zdb/zdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 2c136bacb1a..16c7025802f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1131,7 +1131,7 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size) !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY); if (key64) - (void) printf("\t\t0x%010lx = ", + (void) printf("\t\t0x%010" PRIu64 "x = ", *(uint64_t *)attrp->za_name); else (void) printf("\t\t%s = ", attrp->za_name); From 34efa8e2d884cdb876a017d60b9737adaf467e5d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 11 Oct 2024 14:16:00 -0700 Subject: [PATCH 17/32] CI: Stick with ubuntu-22.04 for CodeQL analysis The ubuntu-latest alias now refers to ubuntu-24.04 instead of ubuntu-22.04 which causes CodeQL's autobuild to fail with: cpp/autobuilder: deptrace not supported in ubuntu 24.04 Until deptrace is supported by ubuntu-24.04 hosted runners request ubuntu-22.04 which is supported. Signed-off-by: Brian Behlendorf Reviewed-by: Tino Reichardt Reviewed-by: George Melikov Closes #16639 --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2656a20fea0..e975d7dd00b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -11,7 +11,7 @@ concurrency: jobs: analyze: name: Analyze - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: actions: read contents: read From 9f3f80c0cc6215b9bb69690c28ad0a4125b21273 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 11 Oct 2024 14:22:24 -0700 Subject: [PATCH 18/32] ZTS: Slightly increase dedup_quota limit As described in the comment above this check the space used by logged entries is not accounted for and some margin needs to be added in. While uncommon we have slightly exceeded the 600,000 threshold on some CI run so we increase the limit a bit more. Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #16637 --- tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index 326152b510a..b1657648b5a 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -221,7 +221,7 @@ function ddt_dedup_vdev_limit # For here, we just set the entry count a little higher than what we # expect to allow for some instability. # - log_must test $(ddt_entries) -le 600000 + log_must test $(ddt_entries) -le 650000 do_clean } From 97ba7c210c8fc3c4cd7d6687e07906ff59cf5327 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 11 Oct 2024 16:12:16 -0700 Subject: [PATCH 19/32] ZTS: Increase zpool_import_parallel_pos import margin Increase the pool import time allowed by assuming a minimum reduction to 1/2 instead of 1/3 when comparing sequential to parallel import times. This is sufficient to verify parallel imports are working as intended and should address the occasional false positive failure when the time is slightly exceeded. Reviewed-by: Tino Reichardt Reviewed-by: George Melikov Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #16638 --- .../cli_root/zpool_import/zpool_import_parallel_pos.ksh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh index 71b2437a37e..57a7412e37c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh @@ -113,7 +113,7 @@ wait parallel_time=$SECONDS log_note "asyncronously imported 4 pools in $parallel_time seconds" -log_must test $parallel_time -lt $(($sequential_time / 3)) +log_must test $parallel_time -lt $(($sequential_time / 2)) # # export pools with import delay injectors @@ -132,6 +132,6 @@ log_must zpool import -a -d $DEVICE_DIR -f parallel_time=$SECONDS log_note "asyncronously imported 4 pools in $parallel_time seconds" -log_must test $parallel_time -lt $(($sequential_time / 3)) +log_must test $parallel_time -lt $(($sequential_time / 2)) log_pass "Pool imports occur in parallel" From 48dfe3974710aa9807980852aa5f46e1c311c579 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 12 Oct 2024 13:48:56 -0700 Subject: [PATCH 20/32] Fallback to strerror() when strerror_l() isn't available Some C libraries, such as uClibc, do not provide strerror_l() in which case we fallback to strerror(). Reviewed-by: Tino Reichardt Signed-off-by: Brian Behlendorf Closes #16636 Closes #16640 --- config/user.m4 | 2 +- include/libzutil.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/config/user.m4 b/config/user.m4 index badd920d2b8..4e31745a2ab 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -33,7 +33,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strerror_l strlcat strlcpy gettid]) AC_SUBST(RM) ]) diff --git a/include/libzutil.h b/include/libzutil.h index e2108ceeaa4..f8712340cc5 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -276,7 +276,11 @@ _LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv, * Thread-safe strerror() for use in ZFS libraries */ static inline char *zfs_strerror(int errnum) { +#ifdef HAVE_STRERROR_L return (strerror_l(errnum, uselocale(0))); +#else + return (strerror(errnum)); +#endif } #ifdef __cplusplus From c642e985e596eb38d9ea9be2bc97ff0fd9c7d5a0 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 12 Oct 2024 13:51:35 -0700 Subject: [PATCH 21/32] Revert "Temporarily disable Direct IO by default" This partially reverts commit 41210597. Now that b4e4cbeb2 has been merged Direct IO can be enabled by default for Linux, but for FreeBSD there still remains a potentially insufficient range locking in zfs_getpages() which needs to be resolved. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #16629 --- module/zfs/zfs_vnops.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index d5e0d2a2b35..0799f17758b 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -77,8 +77,15 @@ static int zfs_bclone_wait_dirty = 0; * Enable Direct I/O. If this setting is 0, then all I/O requests will be * directed through the ARC acting as though the dataset property direct was * set to disabled. + * + * Disabled by default on FreeBSD until a potential range locking issue in + * zfs_getpages() can be resolved. */ +#ifdef __FreeBSD__ static int zfs_dio_enabled = 0; +#else +static int zfs_dio_enabled = 1; +#endif /* From e7b64159f86c606a6c454654ade60b8888162a70 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sat, 12 Oct 2024 09:53:32 +0200 Subject: [PATCH 22/32] ZTS: Optimize Kernel Same-page Merging (KSM) Kernel same-page Merging (KSM) allows KVM guests to share identical memory pages. These shared pages are usually common libraries or other identical, high-use data. The current configuration was a bit to lazy - so KSM didn't work very well. With the new configuration I could run 3 Linux VMs in parralel. FreeBSD can't benefit from it. But FreeBSD is not so memory hungry in general, so there is no need for it ;) Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #16641 --- .github/workflows/scripts/qemu-1-setup.sh | 16 +++++++++------- .github/workflows/scripts/qemu-5-setup.sh | 12 +++++++----- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/.github/workflows/scripts/qemu-1-setup.sh b/.github/workflows/scripts/qemu-1-setup.sh index ebd80a2f98c..f838da34eff 100755 --- a/.github/workflows/scripts/qemu-1-setup.sh +++ b/.github/workflows/scripts/qemu-1-setup.sh @@ -18,19 +18,21 @@ ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -q -N "" # we expect RAM shortage cat << EOF | sudo tee /etc/ksmtuned.conf > /dev/null +# /etc/ksmtuned.conf - Configuration file for ksmtuned # https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/7/html/virtualization_tuning_and_optimization_guide/chap-ksm KSM_MONITOR_INTERVAL=60 # Millisecond sleep between ksm scans for 16Gb server. # Smaller servers sleep more, bigger sleep less. -KSM_SLEEP_MSEC=10 -KSM_NPAGES_BOOST=300 -KSM_NPAGES_DECAY=-50 -KSM_NPAGES_MIN=64 -KSM_NPAGES_MAX=2048 +KSM_SLEEP_MSEC=30 -KSM_THRES_COEF=25 -KSM_THRES_CONST=2048 +KSM_NPAGES_BOOST=0 +KSM_NPAGES_DECAY=0 +KSM_NPAGES_MIN=1000 +KSM_NPAGES_MAX=25000 + +KSM_THRES_COEF=80 +KSM_THRES_CONST=8192 LOGFILE=/var/log/ksmtuned.log DEBUG=1 diff --git a/.github/workflows/scripts/qemu-5-setup.sh b/.github/workflows/scripts/qemu-5-setup.sh index 7acb67a2792..5034c189630 100755 --- a/.github/workflows/scripts/qemu-5-setup.sh +++ b/.github/workflows/scripts/qemu-5-setup.sh @@ -14,17 +14,19 @@ PID=$(pidof /usr/bin/qemu-system-x86_64) tail --pid=$PID -f /dev/null sudo virsh undefine openzfs +# default values per test vm: +VMs=2 +CPU=2 + # definitions of per operating system case "$OS" in + # FreeBSD can't be optimized via ksmtuned freebsd*) - VMs=2 - CPU=3 RAM=6 ;; *) - VMs=2 - CPU=3 - RAM=7 + # Linux can be optimized via ksmtuned + RAM=8 ;; esac From e0bf43d64ed01285321bf6c3a308f699c5483efc Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sat, 12 Oct 2024 09:54:50 +0200 Subject: [PATCH 23/32] ZTS: Make use of optimal CPU pinning With CPU pinning, we should get some speedup because of better cpu cache re-use. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #16641 --- .github/workflows/scripts/qemu-5-setup.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/qemu-5-setup.sh b/.github/workflows/scripts/qemu-5-setup.sh index 5034c189630..bc40e8894b2 100755 --- a/.github/workflows/scripts/qemu-5-setup.sh +++ b/.github/workflows/scripts/qemu-5-setup.sh @@ -18,10 +18,12 @@ sudo virsh undefine openzfs VMs=2 CPU=2 -# definitions of per operating system +# cpu pinning +CPUSET=("0,1" "2,3") + case "$OS" in - # FreeBSD can't be optimized via ksmtuned freebsd*) + # FreeBSD can't be optimized via ksmtuned RAM=6 ;; *) @@ -75,6 +77,7 @@ EOF --cpu host-passthrough \ --virt-type=kvm --hvm \ --vcpus=$CPU,sockets=1 \ + --cpuset=${CPUSET[$((i-1))]} \ --memory $((1024*RAM)) \ --memballoon model=virtio \ --graphics none \ From 38a04f0a7c17885385187ce02aa7595aef6f3d4a Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Wed, 16 Oct 2024 11:00:40 -0600 Subject: [PATCH 24/32] freebsd: Use compiler.h from FreeBSD's base's linuxkpi The FreeBSD linux/compiler.h in OpenZFS was copied from a very old version of FreeBSD's linuxkpi's linux/compiler.h. There's no need for this duplication. Use FreeBSD's linuxkpi version instead, and provide zfs_fallthrough to augment it (it's all that's needed). Use #pragma once to avoid naming issues for guard variables. Since this is a complete rewrite, use my copyright here (the original code in FreeBSD still credits everybody). This works back at least to FreeBSD 12.4, which is not out of support, and all newer releases. Remove extra copies of macros that were defined elsewhere, but are now properly defined in LinuxKPI so are redundant. Sponsored-by: Netflix Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Warner Losh Closes #16650 --- include/os/freebsd/linux/compiler.h | 83 +++------------------------- include/os/freebsd/spl/sys/ccompat.h | 9 --- include/os/freebsd/spl/sys/debug.h | 4 -- 3 files changed, 8 insertions(+), 88 deletions(-) diff --git a/include/os/freebsd/linux/compiler.h b/include/os/freebsd/linux/compiler.h index b408b77c746..24f09c72215 100644 --- a/include/os/freebsd/linux/compiler.h +++ b/include/os/freebsd/linux/compiler.h @@ -1,10 +1,5 @@ /* - * Copyright (c) 2010 Isilon Systems, Inc. - * Copyright (c) 2010 iXsystems, Inc. - * Copyright (c) 2010 Panasas, Inc. - * Copyright (c) 2013-2016 Mellanox Technologies, Ltd. - * Copyright (c) 2015 François Tigeot - * All rights reserved. + * Copyright (c) 2024 Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,76 +21,14 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ */ -#ifndef _LINUX_COMPILER_H_ -#define _LINUX_COMPILER_H_ -#include +/* + * FreeBSD's LinuxKPI compiler.h as far back as FreeBSD 12 has what we need, + * except zfs_fallthrough. + */ +#pragma once + +#include -#define __user -#define __kernel -#define __safe -#define __force -#define __nocast -#define __iomem -#define __chk_user_ptr(x) ((void)0) -#define __chk_io_ptr(x) ((void)0) -#define __builtin_warning(x, y...) (1) -#define __acquires(x) -#define __releases(x) -#define __acquire(x) do { } while (0) -#define __release(x) do { } while (0) -#define __cond_lock(x, c) (c) -#define __bitwise -#define __devinitdata -#define __deprecated -#define __init -#define __initconst -#define __devinit -#define __devexit -#define __exit -#define __rcu -#define __percpu -#define __weak __weak_symbol -#define __malloc -#define ___stringify(...) #__VA_ARGS__ -#define __stringify(...) ___stringify(__VA_ARGS__) -#define __attribute_const__ __attribute__((__const__)) -#undef __always_inline -#define __always_inline inline -#define noinline __noinline -#define ____cacheline_aligned __aligned(CACHE_LINE_SIZE) #define zfs_fallthrough __attribute__((__fallthrough__)) - -#if !defined(_KERNEL) && !defined(_STANDALONE) -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) -#endif -#define typeof(x) __typeof(x) - -#define uninitialized_var(x) x = x -#define __maybe_unused __unused -#define __always_unused __unused -#define __must_check __result_use_check - -#define __printf(a, b) __printflike(a, b) - -#define barrier() __asm__ __volatile__("": : :"memory") -#define ___PASTE(a, b) a##b -#define __PASTE(a, b) ___PASTE(a, b) - -#define ACCESS_ONCE(x) (*(volatile __typeof(x) *)&(x)) - -#define WRITE_ONCE(x, v) do { \ - barrier(); \ - ACCESS_ONCE(x) = (v); \ - barrier(); \ -} while (0) - -#define lockless_dereference(p) READ_ONCE(p) - -#define _AT(T, X) ((T)(X)) - -#endif /* _LINUX_COMPILER_H_ */ diff --git a/include/os/freebsd/spl/sys/ccompat.h b/include/os/freebsd/spl/sys/ccompat.h index 48749fb8eea..07b3515ad96 100644 --- a/include/os/freebsd/spl/sys/ccompat.h +++ b/include/os/freebsd/spl/sys/ccompat.h @@ -70,15 +70,6 @@ hlist_del(struct hlist_node *n) n->next->pprev = n->pprev; } /* BEGIN CSTYLED */ -#define READ_ONCE(x) ({ \ - __typeof(x) __var = ({ \ - barrier(); \ - ACCESS_ONCE(x); \ - }); \ - barrier(); \ - __var; \ -}) - #define HLIST_HEAD_INIT { } #define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT #define INIT_HLIST_HEAD(head) (head)->first = NULL diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index f041dde34fc..9eb424dd037 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -95,10 +95,6 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #ifndef expect #define expect(expr, value) (__builtin_expect((expr), (value))) #endif -#ifndef __linux__ -#define likely(expr) expect((expr) != 0, 1) -#define unlikely(expr) expect((expr) != 0, 0) -#endif #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) From 27e8f5610262177567b9aaebc6c9448d783aadd7 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Thu, 17 Oct 2024 18:09:39 +0500 Subject: [PATCH 25/32] Fix inconsistent mount options for ZFS root While mounting ZFS root during boot on Linux distributions from initrd, mount from busybox is effectively used which executes mount system call directly. This skips the ZFS helper mount.zfs, which checks and enables the mount options as specified in dataset properties. As a result, datasets mounted during boot from initrd do not have correct mount options as specified in ZFS dataset properties. There has been an attempt to use mount.zfs in zfs initrd script, responsible for mounting the ZFS root filesystem (PR#13305). This was later reverted (PR#14908) after discovering that using mount.zfs breaks mounting of snapshots on root (/) and other child datasets of root have the same issue (Issue#9461). This happens because switching from busybox mount to mount.zfs correctly parses the mount options but also adds 'mntpoint=/root' to the mount options, which is then prepended to the snapshot mountpoint in '.zfs/snapshot'. '/root' is the directory on Debian with initramfs-tools where root filesystem is mounted before pivot_root. When Linux runtime is reached, trying to access the snapshots on root results in automounting the snapshot on '/root/.zfs/*', which fails. This commit attempts to fix the automounting of snapshots on root, while using mount.zfs in initrd script. Since the mountpoint of dataset is stored in vfs_mntpoint field, we can check if current mountpoint of dataset and vfs_mntpoint are same or not. If they are not same, reset the vfs_mntpoint field with current mountpoint. This fixes the mountpoints of root dataset and children in respective vfs_mntpoint fields when we try to access the snapshots of root dataset or its children. With correct mountpoint for root dataset and children stored in vfs_mntpoint, all snapshots of root dataset are mounted correctly and become accessible. This fix will come into play only if current process, that is trying to access the snapshots is not in chroot context. The Linux kernel API that is used to convert struct path into char format (d_path), returns the complete path for given struct path. It works in chroot environment as well and returns the correct path from original filesystem root. However d_path fails to return the complete path if any directory from original root filesystem is mounted using --bind flag or --rbind flag in chroot environment. In this case, if we try to access the snapshot from outside the chroot environment, d_path returns the path correctly, i.e. it returns the correct path to the directory that is mounted with --bind flag. However inside the chroot environment, it only returns the path inside chroot. For now, there is not a better way in my understanding that gives the complete path in char format and handles the case where directories from root filesystem are mounted with --bind or --rbind on another path which user will later chroot into. So this fix gets enabled if current process trying to access the snapshot is not in chroot context. With the snapshots issue fixed for root filesystem, using mount.zfs in ZFS initrd script, mounts the datasets with correct mount options. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #16646 --- contrib/initramfs/scripts/zfs | 5 +- include/os/linux/zfs/sys/zfs_vfsops_os.h | 1 + module/os/linux/zfs/zfs_ctldir.c | 109 +++++++++++++++++++++-- module/os/linux/zfs/zfs_vfsops.c | 6 +- 4 files changed, 111 insertions(+), 10 deletions(-) diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index 0a2bd2efda7..c569b252836 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -344,7 +344,7 @@ mount_fs() # Need the _original_ datasets mountpoint! mountpoint=$(get_fs_value "$fs" mountpoint) - ZFS_CMD="mount -o zfsutil -t zfs" + ZFS_CMD="mount.zfs -o zfsutil" if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then # Can't use the mountpoint property. Might be one of our # clones. Check the 'org.zol:mountpoint' property set in @@ -359,9 +359,8 @@ mount_fs() # isn't the root fs. return 0 fi - # Don't use mount.zfs -o zfsutils for legacy mountpoint if [ "$mountpoint" = "legacy" ]; then - ZFS_CMD="mount -t zfs" + ZFS_CMD="mount.zfs" fi # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index 7067eb17900..30aa3a103d3 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -69,6 +69,7 @@ typedef struct vfs { boolean_t vfs_do_relatime; boolean_t vfs_nbmand; boolean_t vfs_do_nbmand; + kmutex_t vfs_mntpt_lock; } vfs_t; typedef struct zfs_mnt { diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 8a42a075cd2..f60d6ae91e0 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -767,9 +767,6 @@ zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, uint64_t id, pos = 0; int error = 0; - if (zfsvfs->z_vfs->vfs_mntpoint == NULL) - return (SET_ERROR(ENOENT)); - cookie = spl_fstrans_mark(); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -786,8 +783,14 @@ zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, break; } - snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", - zfsvfs->z_vfs->vfs_mntpoint, snapname); + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); + if (zfsvfs->z_vfs->vfs_mntpoint != NULL) { + snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint, snapname); + } else + error = SET_ERROR(ENOENT); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); + out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); spl_fstrans_unmark(cookie); @@ -1049,6 +1052,66 @@ exportfs_flush(void) (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); } +/* + * Returns the path in char format for given struct path. Uses + * d_path exported by kernel to convert struct path to char + * format. Returns the correct path for mountpoints and chroot + * environments. + * + * If chroot environment has directories that are mounted with + * --bind or --rbind flag, d_path returns the complete path inside + * chroot environment but does not return the absolute path, i.e. + * the path to chroot environment is missing. + */ +static int +get_root_path(struct path *path, char *buff, int len) +{ + char *path_buffer, *path_ptr; + int error = 0; + + path_get(path); + path_buffer = kmem_zalloc(len, KM_SLEEP); + path_ptr = d_path(path, path_buffer, len); + if (IS_ERR(path_ptr)) + error = SET_ERROR(-PTR_ERR(path_ptr)); + else + strcpy(buff, path_ptr); + + kmem_free(path_buffer, len); + path_put(path); + return (error); +} + +/* + * Returns if the current process root is chrooted or not. Linux + * kernel exposes the task_struct for current process and init. + * Since init process root points to actual root filesystem when + * Linux runtime is reached, we can compare the current process + * root with init process root to determine if root of the current + * process is different from init, which can reliably determine if + * current process is in chroot context or not. + */ +static int +is_current_chrooted(void) +{ + struct task_struct *curr = current, *global = &init_task; + struct path cr_root, gl_root; + + task_lock(curr); + get_fs_root(curr->fs, &cr_root); + task_unlock(curr); + + task_lock(global); + get_fs_root(global->fs, &gl_root); + task_unlock(global); + + int chrooted = !path_equal(&cr_root, &gl_root); + path_put(&gl_root); + path_put(&cr_root); + + return (chrooted); +} + /* * Attempt to unmount a snapshot by making a call to user space. * There is no assurance that this can or will succeed, is just a @@ -1123,14 +1186,50 @@ zfsctl_snapshot_mount(struct path *path, int flags) if (error) goto error; + if (is_current_chrooted() == 0) { + /* + * Current process is not in chroot context + */ + + char *m = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + struct path mnt_path; + mnt_path.mnt = path->mnt; + mnt_path.dentry = path->mnt->mnt_root; + + /* + * Get path to current mountpoint + */ + error = get_root_path(&mnt_path, m, MAXPATHLEN); + if (error != 0) { + kmem_free(m, MAXPATHLEN); + goto error; + } + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); + if (zfsvfs->z_vfs->vfs_mntpoint != NULL) { + /* + * If current mnountpoint and vfs_mntpoint are not same, + * store current mountpoint in vfs_mntpoint. + */ + if (strcmp(zfsvfs->z_vfs->vfs_mntpoint, m) != 0) { + kmem_strfree(zfsvfs->z_vfs->vfs_mntpoint); + zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m); + } + } else + zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); + kmem_free(m, MAXPATHLEN); + } + /* * Construct a mount point path from sb of the ctldir inode and dirent * name, instead of from d_path(), so that chroot'd process doesn't fail * on mount.zfs(8). */ + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", dname(dentry)); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); snprintf(options, 7, "%s", zfs_snapshot_no_setuid ? "nosuid" : "suid"); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index de3e8c89cfd..3c53a8a315c 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -115,7 +115,7 @@ zfsvfs_vfs_free(vfs_t *vfsp) if (vfsp != NULL) { if (vfsp->vfs_mntpoint != NULL) kmem_strfree(vfsp->vfs_mntpoint); - + mutex_destroy(&vfsp->vfs_mntpt_lock); kmem_free(vfsp, sizeof (vfs_t)); } } @@ -197,10 +197,11 @@ zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_MNTPOINT: + if (vfsp->vfs_mntpoint != NULL) + kmem_strfree(vfsp->vfs_mntpoint); vfsp->vfs_mntpoint = match_strdup(&args[0]); if (vfsp->vfs_mntpoint == NULL) return (SET_ERROR(ENOMEM)); - break; default: break; @@ -219,6 +220,7 @@ zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) int error; tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); + mutex_init(&tmp_vfsp->vfs_mntpt_lock, NULL, MUTEX_DEFAULT, NULL); if (mntopts != NULL) { substring_t args[MAX_OPT_ARGS]; From 0a001f30888f68f63078f9763c98ee727d96bf29 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 15 Oct 2024 22:18:19 +1100 Subject: [PATCH 26/32] libspl/backtrace: dump registers in libunwind backtraces More useful stuff, especially when trying to follow a disassembly. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16653 --- lib/libspl/backtrace.c | 44 +++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index d26d742106e..e969823c2e5 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -65,32 +65,58 @@ libspl_backtrace(int fd) ssize_t ret __attribute__((unused)); unw_context_t uc; unw_cursor_t cp; - unw_word_t loc; + unw_word_t v; char buf[128]; - size_t n; + size_t n, c; - ret = write(fd, "Call trace:\n", 12); unw_getcontext(&uc); + unw_init_local(&cp, &uc); + ret = write(fd, "Registers:\n", 11); + c = 0; + for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) { + if (unw_get_reg(&cp, regnum, &v) < 0) + continue; + const char *name = unw_regname(regnum); + for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {} + if (n == 0) { + buf[0] = '?'; + n = libspl_u64_to_hex_str(regnum, 2, + &buf[1], sizeof (buf)-1) + 1; + name = buf; + } + ret = write(fd, " ", 5-MIN(n, 3)); + ret = write(fd, name, n); + ret = write(fd, ": 0x", 4); + n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); + ret = write(fd, buf, n); + if (!(++c % 3)) + ret = write(fd, "\n", 1); + } + if (c % 3) + ret = write(fd, "\n", 1); + + unw_init_local(&cp, &uc); + ret = write(fd, "Call trace:\n", 12); while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &loc); + unw_get_reg(&cp, UNW_REG_IP, &v); ret = write(fd, " [0x", 5); - n = libspl_u64_to_hex_str(loc, 10, buf, sizeof (buf)); + n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); ret = write(fd, buf, n); ret = write(fd, "] ", 2); - unw_get_proc_name(&cp, buf, sizeof (buf), &loc); + unw_get_proc_name(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} ret = write(fd, buf, n); ret = write(fd, "+0x", 3); - n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); + n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); ret = write(fd, buf, n); #ifdef HAVE_LIBUNWIND_ELF ret = write(fd, " (in ", 5); - unw_get_elf_filename(&cp, buf, sizeof (buf), &loc); + unw_get_elf_filename(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} ret = write(fd, buf, n); ret = write(fd, " +0x", 4); - n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); + n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); ret = write(fd, buf, n); ret = write(fd, ")", 1); #endif From c7e47b3d9ac36269abeeffe100754311038779b6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 18 Oct 2024 13:49:41 +1100 Subject: [PATCH 27/32] libspl/backtrace: helper macros for output My eyes are going blurry looking at all those write calls. This is much nicer. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Close #16653 --- lib/libspl/backtrace.c | 58 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index e969823c2e5..b4568d124dd 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -25,12 +25,20 @@ #include #include +#include #include /* - * libspl_backtrace() must be safe to call from inside a signal hander. This - * mostly means it must not allocate, and so we can't use things like printf. + * Output helpers. libspl_backtrace() must not block, must be thread-safe and + * must be safe to call from a signal handler. At least, that means not having + * printf, so we end up having to call write() directly on the fd. That's + * awkward, as we always have to pass through a length, and some systems will + * complain if we don't consume the return. So we have some macros to make + * things a little more palatable. */ +#define spl_bt_write_n(fd, s, n) \ + do { ssize_t r __maybe_unused = write(fd, s, n); } while (0) +#define spl_bt_write(fd, s) spl_bt_write_n(fd, s, sizeof (s)) #if defined(HAVE_LIBUNWIND) #define UNW_LOCAL_ONLY @@ -62,7 +70,6 @@ libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) void libspl_backtrace(int fd) { - ssize_t ret __attribute__((unused)); unw_context_t uc; unw_cursor_t cp; unw_word_t v; @@ -72,7 +79,7 @@ libspl_backtrace(int fd) unw_getcontext(&uc); unw_init_local(&cp, &uc); - ret = write(fd, "Registers:\n", 11); + spl_bt_write(fd, "Registers:\n"); c = 0; for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) { if (unw_get_reg(&cp, regnum, &v) < 0) @@ -85,42 +92,42 @@ libspl_backtrace(int fd) &buf[1], sizeof (buf)-1) + 1; name = buf; } - ret = write(fd, " ", 5-MIN(n, 3)); - ret = write(fd, name, n); - ret = write(fd, ": 0x", 4); + spl_bt_write_n(fd, " ", 5-MIN(n, 3)); + spl_bt_write_n(fd, name, n); + spl_bt_write(fd, ": 0x"); n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); - ret = write(fd, buf, n); + spl_bt_write_n(fd, buf, n); if (!(++c % 3)) - ret = write(fd, "\n", 1); + spl_bt_write(fd, "\n"); } if (c % 3) - ret = write(fd, "\n", 1); + spl_bt_write(fd, "\n"); unw_init_local(&cp, &uc); - ret = write(fd, "Call trace:\n", 12); + spl_bt_write(fd, "Call trace:\n"); while (unw_step(&cp) > 0) { unw_get_reg(&cp, UNW_REG_IP, &v); - ret = write(fd, " [0x", 5); + spl_bt_write(fd, " [0x"); n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); - ret = write(fd, buf, n); - ret = write(fd, "] ", 2); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, "] "); unw_get_proc_name(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - ret = write(fd, buf, n); - ret = write(fd, "+0x", 3); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, "+0x"); n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); - ret = write(fd, buf, n); + spl_bt_write_n(fd, buf, n); #ifdef HAVE_LIBUNWIND_ELF - ret = write(fd, " (in ", 5); + spl_bt_write(fd, " (in "); unw_get_elf_filename(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - ret = write(fd, buf, n); - ret = write(fd, " +0x", 4); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, " +0x"); n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); - ret = write(fd, buf, n); - ret = write(fd, ")", 1); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, ")"); #endif - ret = write(fd, "\n", 1); + spl_bt_write(fd, "\n"); } } #elif defined(HAVE_BACKTRACE) @@ -129,15 +136,12 @@ libspl_backtrace(int fd) void libspl_backtrace(int fd) { - ssize_t ret __attribute__((unused)); void *btptrs[64]; size_t nptrs = backtrace(btptrs, 64); - ret = write(fd, "Call trace:\n", 12); + spl_bt_write(fd, "Call trace:\n"); backtrace_symbols_fd(btptrs, nptrs, fd); } #else -#include - void libspl_backtrace(int fd __maybe_unused) { From 2596a75306980c7cb3de22fcf09e65dac2fbf279 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 18 Oct 2024 14:30:05 +1100 Subject: [PATCH 28/32] libspl/backtrace: rename and document hex conversion function Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16653 --- lib/libspl/backtrace.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index b4568d124dd..b541858cbc5 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -44,8 +44,13 @@ #define UNW_LOCAL_ONLY #include +/* + * Convert `v` to ASCII hex characters. The bottom `n` nybbles (4-bits ie one + * hex digit) will be written, up to `buflen`. The buffer will not be + * null-terminated. Returns the number of digits written. + */ static size_t -libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) +spl_bt_u64_to_hex_str(uint64_t v, size_t n, char *buf, size_t buflen) { static const char hexdigits[] = { '0', '1', '2', '3', '4', '5', '6', '7', @@ -53,10 +58,10 @@ libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) }; size_t pos = 0; - boolean_t want = (digits == 0); + boolean_t want = (n == 0); for (int i = 15; i >= 0; i--) { const uint64_t d = v >> (i * 4) & 0xf; - if (!want && (d != 0 || digits > i)) + if (!want && (d != 0 || n > i)) want = B_TRUE; if (want) { buf[pos++] = hexdigits[d]; @@ -88,14 +93,14 @@ libspl_backtrace(int fd) for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {} if (n == 0) { buf[0] = '?'; - n = libspl_u64_to_hex_str(regnum, 2, + n = spl_bt_u64_to_hex_str(regnum, 2, &buf[1], sizeof (buf)-1) + 1; name = buf; } spl_bt_write_n(fd, " ", 5-MIN(n, 3)); spl_bt_write_n(fd, name, n); spl_bt_write(fd, ": 0x"); - n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); + n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); if (!(++c % 3)) spl_bt_write(fd, "\n"); @@ -108,14 +113,14 @@ libspl_backtrace(int fd) while (unw_step(&cp) > 0) { unw_get_reg(&cp, UNW_REG_IP, &v); spl_bt_write(fd, " [0x"); - n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); + n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); spl_bt_write(fd, "] "); unw_get_proc_name(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} spl_bt_write_n(fd, buf, n); spl_bt_write(fd, "+0x"); - n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); #ifdef HAVE_LIBUNWIND_ELF spl_bt_write(fd, " (in "); @@ -123,7 +128,7 @@ libspl_backtrace(int fd) for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} spl_bt_write_n(fd, buf, n); spl_bt_write(fd, " +0x"); - n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); spl_bt_write(fd, ")"); #endif From b85c5641611287095367ff9d5410cd74a896ab4f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 18 Oct 2024 15:10:33 +1100 Subject: [PATCH 29/32] libspl/backtrace: comment and harden libunwind backtracer This is the sort of code that we get right once and never look at again. Anyone reading this code is already likely in the middle of a debugging nightmare, and then they have a wall of manual string construction and an unfamiliar and idiosyncratic library to deal with. So, comment the whole thing to try to make it clear what's going on. In pursuit of the above, I've added return checks to some of the libunwind calls, fixed the frame loop to not skip the "top" frame (however unseful it may be), and fix a couple of calls to spl_bt_u64_to_hex_str() which requested 18 digits instead of 16. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16653 --- lib/libspl/backtrace.c | 166 ++++++++++++++++++++++++++++++++++------- 1 file changed, 141 insertions(+), 25 deletions(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index b541858cbc5..6e8b3b12122 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -79,61 +79,177 @@ libspl_backtrace(int fd) unw_cursor_t cp; unw_word_t v; char buf[128]; - size_t n, c; + size_t n; + int err; + /* Snapshot the current frame and state. */ unw_getcontext(&uc); - unw_init_local(&cp, &uc); + /* + * TODO: walk back to the frame that tripped the assertion / the place + * where the signal was recieved. + */ + + /* + * Register dump. We're going to loop over all the registers in the + * top frame, and show them, with names, in a nice three-column + * layout, which keeps us within 80 columns. + */ spl_bt_write(fd, "Registers:\n"); - c = 0; + + /* Initialise a frame cursor, starting at the current frame */ + unw_init_local(&cp, &uc); + + /* + * libunwind's list of possible registers for this architecture is an + * enum, unw_regnum_t. UNW_TDEP_LAST_REG is the highest-numbered + * register in that list, however, not all register numbers in this + * range are defined by the architecture, and not all defined registers + * will be present on every implementation of that architecture. + * Moreover, libunwind provides nice names for most, but not all + * registers, but these are hardcoded; a name being available does not + * mean that register is available. + * + * So, we have to pull this all together here. We try to get the value + * of every possible register. If we get a value for it, then the + * register must exist, and so we get its name. If libunwind has no + * name for it, we synthesize something. These cases should be rare, + * and they're usually for uninteresting or niche registers, so it + * shouldn't really matter. We can see the value, and that's the main + * thing. + */ + uint_t cols = 0; for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) { + /* + * Get the value. Any error probably means the register + * doesn't exist, and we skip it. + */ if (unw_get_reg(&cp, regnum, &v) < 0) continue; + + /* + * Register name. If libunwind doesn't have a name for it, + * it will return "???". As a shortcut, we just treat '?' + * is an alternate end-of-string character. + */ const char *name = unw_regname(regnum); for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {} if (n == 0) { + /* + * No valid name, so make one of the form "?xx", where + * "xx" is the two-char hex of libunwind's register + * number. + */ buf[0] = '?'; n = spl_bt_u64_to_hex_str(regnum, 2, &buf[1], sizeof (buf)-1) + 1; name = buf; } + + /* + * Two spaces of padding before each column, plus extra + * spaces to align register names shorter than three chars. + */ spl_bt_write_n(fd, " ", 5-MIN(n, 3)); + + /* Register name and column punctuation */ spl_bt_write_n(fd, name, n); spl_bt_write(fd, ": 0x"); - n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf)); + + /* + * Convert register value (from unw_get_reg()) to hex. We're + * assuming that all registers are 64-bits wide, which is + * probably fine for any general-purpose registers on any + * machine currently in use. A more generic way would be to + * look at the width of unw_word_t, but that would also + * complicate the column code a bit. This is fine. + */ + n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); - if (!(++c % 3)) + + /* Every third column, emit a newline */ + if (!(++cols % 3)) spl_bt_write(fd, "\n"); } - if (c % 3) + + /* If we finished before the third column, emit a newline. */ + if (cols % 3) spl_bt_write(fd, "\n"); - unw_init_local(&cp, &uc); + /* Now the main event, the backtrace. */ spl_bt_write(fd, "Call trace:\n"); - while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &v); + + /* Reset the cursor to the top again. */ + unw_init_local(&cp, &uc); + + do { + /* + * Getting the IP should never fail; libunwind handles it + * specially, because its used a lot internally. Still, no + * point being silly about it, as the last thing we want is + * our crash handler to crash. So if it ever does fail, we'll + * show an error line, but keep going to the next frame. + */ + if (unw_get_reg(&cp, UNW_REG_IP, &v) < 0) { + spl_bt_write(fd, " [couldn't get IP register; " + "corrupt frame?]"); + continue; + } + + /* IP & punctuation */ + n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf)); spl_bt_write(fd, " [0x"); - n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); spl_bt_write(fd, "] "); - unw_get_proc_name(&cp, buf, sizeof (buf), &v); - for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - spl_bt_write_n(fd, buf, n); - spl_bt_write(fd, "+0x"); - n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); - spl_bt_write_n(fd, buf, n); + + /* + * Function ("procedure") name for the current frame. `v` + * receives the offset from the named function to the IP, which + * we show as a "+offset" suffix. + * + * If libunwind can't determine the name, we just show "???" + * instead. We've already displayed the IP above; that will + * have to do. + * + * unw_get_proc_name() will return ENOMEM if the buffer is too + * small, instead truncating the name. So we treat that as a + * success and use whatever is in the buffer. + */ + err = unw_get_proc_name(&cp, buf, sizeof (buf), &v); + if (err == 0 || err == -UNW_ENOMEM) { + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + spl_bt_write_n(fd, buf, n); + + /* Offset from proc name */ + spl_bt_write(fd, "+0x"); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); + spl_bt_write_n(fd, buf, n); + } else + spl_bt_write(fd, "???"); + #ifdef HAVE_LIBUNWIND_ELF - spl_bt_write(fd, " (in "); - unw_get_elf_filename(&cp, buf, sizeof (buf), &v); - for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - spl_bt_write_n(fd, buf, n); - spl_bt_write(fd, " +0x"); - n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); - spl_bt_write_n(fd, buf, n); - spl_bt_write(fd, ")"); + /* + * Newer libunwind has unw_get_elf_filename(), which gets + * the name of the ELF object that the frame was executing in. + * Like `unw_get_proc_name()`, `v` recieves the offset within + * the file, and UNW_ENOMEM indicates that a truncate filename + * was left in the buffer. + */ + err = unw_get_elf_filename(&cp, buf, sizeof (buf), &v); + if (err == 0 || err == -UNW_ENOMEM) { + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + spl_bt_write(fd, " (in "); + spl_bt_write_n(fd, buf, n); + + /* Offset within file */ + spl_bt_write(fd, " +0x"); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, ")"); + } #endif spl_bt_write(fd, "\n"); - } + } while (unw_step(&cp) > 0); } #elif defined(HAVE_BACKTRACE) #include From fba6a90696eaa74dc19531995484218f7e900339 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sun, 20 Oct 2024 12:39:05 -0400 Subject: [PATCH 30/32] zfs_debug: Restore log size limit for userspace For some reason it was dropped when split from kernel, that makes raidz_test to accumulate in RAM up to 100GB of logs we don't need. Reviewed-by: Brian Behlendorf Reviewed-by: Igor Kozhukhov Reviewed-by: Rob Norris Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16492 Closes #16566 Closes #16664 --- lib/libzpool/zfs_debug.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/lib/libzpool/zfs_debug.c b/lib/libzpool/zfs_debug.c index df49a9a33fe..82c7229932f 100644 --- a/lib/libzpool/zfs_debug.c +++ b/lib/libzpool/zfs_debug.c @@ -35,9 +35,25 @@ typedef struct zfs_dbgmsg { static list_t zfs_dbgmsgs; static kmutex_t zfs_dbgmsgs_lock; +static uint_t zfs_dbgmsg_size = 0; +static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ int zfs_dbgmsg_enable = B_TRUE; +static void +zfs_dbgmsg_purge(uint_t max_size) +{ + while (zfs_dbgmsg_size > max_size) { + zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs); + if (zdm == NULL) + return; + + uint_t size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + void zfs_dbgmsg_init(void) { @@ -74,6 +90,8 @@ __zfs_dbgmsg(char *buf) mutex_enter(&zfs_dbgmsgs_lock); list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(zfs_dbgmsg_maxsize); mutex_exit(&zfs_dbgmsgs_lock); } From a9851ea3dd6af6f789e221f2ccd7ad43561eff81 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Mon, 21 Oct 2024 01:43:16 +0900 Subject: [PATCH 31/32] Fix compile-time warnings caused by duplicate struct typedefs Some compiler/versions warn these typedefs according to #16660. The platform specific header sys/abd_os.h shouldn't define or use abd_t, as it's defined in its non-platform specific consumer sys/abd.h. Do the same as what FreeBSD header does. Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #16660 Closes #16665 --- include/os/linux/spl/sys/taskq.h | 3 +-- include/os/linux/zfs/sys/abd_os.h | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index f63a397f293..4cb3a055c3c 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -38,8 +38,7 @@ #include #include #include - -typedef struct kstat_s kstat_t; +#include #define TASKQ_NAMELEN 31 diff --git a/include/os/linux/zfs/sys/abd_os.h b/include/os/linux/zfs/sys/abd_os.h index 606e8bf682e..3eed968e90c 100644 --- a/include/os/linux/zfs/sys/abd_os.h +++ b/include/os/linux/zfs/sys/abd_os.h @@ -30,6 +30,8 @@ extern "C" { #endif +struct abd; + struct abd_scatter { uint_t abd_offset; uint_t abd_nents; @@ -41,10 +43,8 @@ struct abd_linear { struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ }; -typedef struct abd abd_t; - typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); -int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, +int abd_iterate_page_func(struct abd *, size_t, size_t, abd_iter_page_func_t *, void *); /* @@ -52,11 +52,11 @@ int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, * Note: these are only needed to support vdev_classic. See comment in * vdev_disk.c. */ -unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +unsigned int abd_bio_map_off(struct bio *, struct abd *, unsigned int, size_t); +unsigned long abd_nr_pages_off(struct abd *, unsigned int, size_t); __attribute__((malloc)) -abd_t *abd_alloc_from_pages(struct page **, unsigned long, uint64_t); +struct abd *abd_alloc_from_pages(struct page **, unsigned long, uint64_t); #ifdef __cplusplus } From b2f6de7b58f81a4894ba26d87865a15e6115a1c2 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 21 Oct 2024 04:01:49 +1100 Subject: [PATCH 32/32] zdb: show bp in uberblock dump Just another useful nugget of info in times of strife. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16667 --- cmd/zdb/zdb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 16c7025802f..42b82c08249 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4266,6 +4266,10 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, ctime(×tamp)); + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); + (void) printf("\tbp = %s\n", blkbuf); + (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (MMP_VALID(ub)) {