diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile index 0c997e38d37..0c2030faa3d 100644 --- a/cddl/lib/libzpool/Makefile +++ b/cddl/lib/libzpool/Makefile @@ -163,6 +163,8 @@ KERNEL_C = \ vdev_root.c \ vdev_trim.c \ zap.c \ + zap_fat.c \ + zap_impl.c \ zap_leaf.c \ zap_micro.c \ zcp.c \ diff --git a/sys/conf/files b/sys/conf/files index 379685d8371..324ee35d490 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -346,6 +346,8 @@ contrib/openzfs/module/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev_trim.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/zap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zap_fat.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zap_impl.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/zcp.c optional zfs compile-with "${ZFS_C}" diff --git a/sys/contrib/openzfs/.github/workflows/README.md b/sys/contrib/openzfs/.github/workflows/README.md index eef47dae3dc..78774aac52f 100644 --- a/sys/contrib/openzfs/.github/workflows/README.md +++ b/sys/contrib/openzfs/.github/workflows/README.md @@ -1,61 +1,96 @@ -## The testings are done this way +## CI overview + +The main test pipeline is `zfs-qemu.yml`. Code checking and other +workflows run independently alongside it. 
```mermaid flowchart TB -subgraph CleanUp and Summary - CleanUp+Summary +subgraph Functional testing + Setup[test-config: pick ci_type + OS matrix] + Setup --> almalinux + Setup --> centos[centos-stream] + Setup --> debian + Setup --> fedora + Setup --> ubuntu + Setup --> freebsd + almalinux --> Cleanup[cleanup + summary] + centos --> Cleanup + debian --> Cleanup + fedora --> Cleanup + ubuntu --> Cleanup + freebsd --> Cleanup end -subgraph Functional Testings - sanity-checks-20.04 - zloop-checks-20.04 - functional-testing-20.04-->Part1-20.04 - functional-testing-20.04-->Part2-20.04 - functional-testing-20.04-->Part3-20.04 - functional-testing-20.04-->Part4-20.04 - functional-testing-22.04-->Part1-22.04 - functional-testing-22.04-->Part2-22.04 - functional-testing-22.04-->Part3-22.04 - functional-testing-22.04-->Part4-22.04 - sanity-checks-22.04 - zloop-checks-22.04 -end - -subgraph Code Checking + Building - Build-Ubuntu-20.04 +subgraph Code checking + checkstyle.yaml codeql.yml - checkstyle.yml - Build-Ubuntu-22.04 + smatch.yml end - Build-Ubuntu-20.04-->sanity-checks-20.04 - Build-Ubuntu-20.04-->zloop-checks-20.04 - Build-Ubuntu-20.04-->functional-testing-20.04 - Build-Ubuntu-22.04-->sanity-checks-22.04 - Build-Ubuntu-22.04-->zloop-checks-22.04 - Build-Ubuntu-22.04-->functional-testing-22.04 - - sanity-checks-20.04-->CleanUp+Summary - Part1-20.04-->CleanUp+Summary - Part2-20.04-->CleanUp+Summary - Part3-20.04-->CleanUp+Summary - Part4-20.04-->CleanUp+Summary - Part1-22.04-->CleanUp+Summary - Part2-22.04-->CleanUp+Summary - Part3-22.04-->CleanUp+Summary - Part4-22.04-->CleanUp+Summary - sanity-checks-22.04-->CleanUp+Summary +subgraph Other workflows + zfs-arm.yml + zloop.yml + labels.yml +end ``` +Every `qemu-vm` matrix entry runs on a fixed `ubuntu-24.04` host. 
+The steps inside one entry are: -1) build zfs modules for Ubuntu 20.04 and 22.04 (~15m) -2) 2x zloop test (~10m) + 2x sanity test (~25m) -3) 4x functional testings in parts 1..4 (each ~1h) -4) cleanup and create summary - - content of summary depends on the results of the steps +1) set up QEMU and boot the guest (~2-4m) +2) install build dependencies in the guest (~2-4m) +3) build zfs modules in the guest (~8-12m) +4) run functional tests (~2-4h) +5) package and upload per-OS test logs (~10s) -When everything runs fine, the full run should be done in -about 2 hours. +A per-OS entry takes about 3 to 4 hours. Once all entries finish, the +`cleanup` job aggregates the results into a summary. -The codeql.yml and checkstyle.yml are not part in this circle. +### `ci_type` selection + +`test-config` runs `.github/workflows/scripts/generate-ci-type.py` against +the PR's changed files and picks one of: + +| `ci_type` | OS matrix | +|-----------|--------------------------------------------| +| `docs` | empty (documentation-only PRs) | +| `quick` | 6 Linux + 1 FreeBSD | +| `linux` | all supported Linux distros | +| `freebsd` | all supported FreeBSD versions | +| default | cross-platform sample | + +Pushes to `openzfs/zfs` skip the matrix entirely; only PRs (and pushes to +forks) build. + +Authors can force a specific ci_type by adding `ZFS-CI-Type: <type>` to +the most recent commit message. The `ZTS_OS_OVERRIDE` repository variable +can also alter the selection. The `workflow_dispatch` trigger accepts +`fedora_kernel_ver` (Fedora-only run with a chosen kernel) and +`specific_os` (pin the matrix to one OS). 
+ +### Supported guests + +Auto-selected: + +- Linux: almalinux 8/9/10, centos-stream 9/10, debian 11/12/13, + fedora 43/44, ubuntu 22/24/26 +- FreeBSD: 14.4-RELEASE/STABLE, 15.0-RELEASE, 15.1-STABLE, 16.0-CURRENT + +Available via `specific_os` or `ZTS_OS_OVERRIDE`: + +- archlinux, tumbleweed + +### Code checking + +- `checkstyle.yaml`: source-style checks +- `codeql.yml`: CodeQL analysis +- `smatch.yml`: smatch analysis + +### Other workflows + +- `zfs-arm.yml`: ARM build on `ubuntu-24.04-arm` +- `zloop.yml`: host-side zloop +- `labels.yml`: maintains PR status labels +- `zfs-qemu-packages.yml`: manually dispatched, builds release RPMs or + tests RPM installation from the ZFS yum repo diff --git a/sys/contrib/openzfs/.github/workflows/checkstyle.yaml b/sys/contrib/openzfs/.github/workflows/checkstyle.yaml index ddcc2b8581f..ce1e1fb8a46 100644 --- a/sys/contrib/openzfs/.github/workflows/checkstyle.yaml +++ b/sys/contrib/openzfs/.github/workflows/checkstyle.yaml @@ -12,7 +12,7 @@ jobs: checkstyle: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies diff --git a/sys/contrib/openzfs/.github/workflows/codeql.yml b/sys/contrib/openzfs/.github/workflows/codeql.yml index 689fe71fddc..fbaf53dc61e 100644 --- a/sys/contrib/openzfs/.github/workflows/codeql.yml +++ b/sys/contrib/openzfs/.github/workflows/codeql.yml @@ -11,7 +11,7 @@ concurrency: jobs: analyze: name: Analyze - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 permissions: actions: read contents: read @@ -31,15 +31,15 @@ jobs: uses: actions/checkout@v6 - name: Initialize CodeQL - uses: github/codeql-action/init@v3 + uses: github/codeql-action/init@v4 with: config-file: .github/codeql-${{ matrix.language }}.yml languages: ${{ matrix.language }} - name: Autobuild - uses: github/codeql-action/autobuild@v3 + uses: github/codeql-action/autobuild@v4 - name: Perform CodeQL Analysis - uses: 
github/codeql-action/analyze@v3 + uses: github/codeql-action/analyze@v4 with: category: "/language:${{matrix.language}}" diff --git a/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py b/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py index b1910ab630a..4862cc16139 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py +++ b/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py @@ -6,6 +6,9 @@ Output format: "<type> <source>" where source is "manual" (from ZFS-CI-Type commit tag) or "auto" (from file change heuristics). +Prints "docs auto" if every changed file is documentation; the qemu +matrix is skipped in that case. + Prints "quick manual" if: - the *last* commit message contains 'ZFS-CI-Type: quick' or "quick auto" if (heuristics): @@ -28,10 +31,24 @@ r'.*\.gitignore' ])) +""" +Patterns of files that are documentation only. +""" +DOCS_ONLY_REGEX = list(map(re.compile, [ + r'man/.*', + r'.*\.md', + r'AUTHORS', + r'COPYRIGHT', + r'LICENSE', + r'NOTICE', + r'\.gitignore', +])) + """ Patterns of files that are considered to trigger full CI. 
""" FULL_RUN_REGEX = list(map(re.compile, [ + r'\.github/workflows/.*\.ya?ml', r'\.github/workflows/scripts/.*', r'cmd.*', r'configs/.*', @@ -116,6 +133,12 @@ def output_type(type, source, reason): f'changed file "{f}" matches pattern "{r.pattern}"' ) + if changed_files and all( + any(r.match(f) for r in DOCS_ONLY_REGEX) + for f in changed_files): + output_type('docs', 'auto', + 'all changed files are documentation') + # catch-all output_type('quick', 'auto', 'no changed file matches full CI patterns') diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-1-setup.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-1-setup.sh index 5c41a4d6a49..2e83b441588 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-1-setup.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-1-setup.sh @@ -17,6 +17,8 @@ sudo docker builder prune -a unneeded="microsoft-edge-stable|azure-cli|google-cloud|google-chrome-stable|"\ "temurin|llvm|firefox|mysql-server|snapd|android|dotnet|haskell|ghcup|"\ "powershell|julia|swift|miniconda|chromium" +# refresh package index before removing packages +sudo apt-get -y update sudo apt-get -y remove $(dpkg-query -f '${binary:Package}\n' -W | grep -E "'$unneeded'") sudo apt-get -y autoremove diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh index e63aece389c..7e72030adb9 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh @@ -28,6 +28,7 @@ NIC="virtio" # additional options for virt-install OPTS[0]="" OPTS[1]="" +ALT_URL="" case "$OS" in almalinux8) @@ -56,11 +57,22 @@ case "$OS" in centos-stream9) OSNAME="CentOS Stream 9" URL="https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-GenericCloud-9-latest.x86_64.qcow2" + + # Sometimes we get HTTP errors for the first link. Fall back to the + # "Composes" repo as an alternative. 
The "Composes" repo includes + # autogenerated nightly CentOS Stream images. We have to lookup the URL + # dynamically since the qcow2 file name has the date in it. + ALT_URL=$(wget --accept "CentOS-Stream-GenericCloud-9-*.x86_64.qcow2" --spider -np --recursive --no-verbose \ + https://composes.stream.centos.org/stream-9/production/latest-CentOS-Stream/compose/BaseOS/x86_64/images/ 2>&1 | \ + awk '/200 OK/{print $(NF-2)}') ;; centos-stream10) OSNAME="CentOS Stream 10" OSv="centos-stream9" URL="https://cloud.centos.org/centos/10-stream/x86_64/images/CentOS-Stream-GenericCloud-10-latest.x86_64.qcow2" + ALT_URL=$(wget --accept "CentOS-Stream-GenericCloud-10-*.x86_64.qcow2" --spider -np --recursive --no-verbose \ + https://composes.stream.centos.org/stream-10/production/latest-CentOS-Stream/compose/BaseOS/x86_64/images/ 2>&1 | \ + awk '/200 OK/{print $(NF-2)}') ;; debian11) OSNAME="Debian 11" @@ -78,11 +90,6 @@ case "$OS" in OPTS[0]="--boot" OPTS[1]="uefi=on" ;; - fedora42) - OSNAME="Fedora 42" - OSv="fedora-unknown" - URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2" - ;; fedora43) OSNAME="Fedora 43" OSv="fedora-unknown" @@ -93,14 +100,6 @@ case "$OS" in OSv="fedora-unknown" URL="https://download.fedoraproject.org/pub/fedora/linux/releases/44/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-44-1.7.x86_64.qcow2" ;; - freebsd13-5r) - FreeBSD="13.5-RELEASE" - OSNAME="FreeBSD $FreeBSD" - OSv="freebsd13.0" - URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" - KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" - NIC="rtl8139" - ;; freebsd14-4r) FreeBSD="14.4-RELEASE" OSNAME="FreeBSD $FreeBSD" @@ -111,18 +110,10 @@ case "$OS" in freebsd15-0r) FreeBSD="15.0-RELEASE" OSNAME="FreeBSD $FreeBSD" - OSv="freebsd15.0" + OSv="freebsd14.0" URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" ;; - 
freebsd13-5s) - FreeBSD="13.5-STABLE" - OSNAME="FreeBSD $FreeBSD" - OSv="freebsd13.0" - URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" - KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" - NIC="rtl8139" - ;; freebsd14-4s) FreeBSD="14.4-STABLE" OSNAME="FreeBSD $FreeBSD" @@ -131,7 +122,7 @@ case "$OS" in KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; freebsd15-1s) - FreeBSD="15.1-PRERELEASE" + FreeBSD="15.1-STABLE" OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" @@ -160,6 +151,11 @@ case "$OS" in OSv="ubuntu24.04" URL="$UBMIRROR/noble/current/noble-server-cloudimg-amd64.img" ;; + ubuntu26) + OSNAME="Ubuntu 26.04" + OSv="ubuntu24.04" + URL="$UBMIRROR/resolute/current/resolute-server-cloudimg-amd64.img" + ;; *) echo "Wrong value for OS variable!" exit 111 @@ -173,7 +169,6 @@ echo "ENV=$ENV" >> $ENV # result path echo 'RESPATH="/var/tmp/test_results"' >> $ENV -# FreeBSD 13 has problems with: e1000 and virtio echo "NIC=$NIC" >> $ENV # freebsd15 -> used in zfs-qemu.yml @@ -221,6 +216,16 @@ for cmd in 'axel -q -o' 'curl --fail -LSs -o' ; do if [ -s "$IMG" ] ; then # Successful download break + else + if [ -n "$ALT_URL" ] ; then + # Try the $ALT_URL if specified + echo "Loading alternative $ALT_URL with $cmd..." 
+ time eval "$cmd $IMG $ALT_URL" + if [ -s "$IMG" ]; then + # Successful ALT_URL download + break + fi + fi fi done diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh index 6a83ef45fd2..d61e97cf423 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh @@ -215,7 +215,7 @@ case "$1" in tumbleweed) tumbleweed ;; - ubuntu*) + ubuntu22|ubuntu24) debian echo "##[group]Install Ubuntu specific" sudo apt-get install -yq linux-tools-common libtirpc-dev \ @@ -226,6 +226,27 @@ case "$1" in # https://github.com/actions/runner-images/issues/9946 sudo apt-get install -yq build-essential + echo "##[endgroup]" + echo "##[group]Delete Ubuntu OpenZFS modules" + for i in $(find /lib/modules -name zfs -type d); do sudo rm -rvf $i; done + echo "##[endgroup]" + ;; + ubuntu26) + debian + echo "##[group]Install Ubuntu specific" + # Skip linux-modules-extra which is already installed + sudo apt-get install -yq linux-tools-common + sudo apt-get install -yq libtirpc-dev + sudo apt-get install -yq dh-sequence-dkms + + # Need 'build-essential' explicitly for ARM builder + # https://github.com/actions/runner-images/issues/9946 + sudo apt-get install -yq build-essential + + # Replace sudo-rs with sudo for now because the Rust version + # does not support -E to preserve the entire environment + sudo update-alternatives --set sudo /usr/bin/sudo.ws + echo "##[endgroup]" echo "##[group]Delete Ubuntu OpenZFS modules" for i in $(find /lib/modules -name zfs -type d); do sudo rm -rvf $i; done @@ -267,8 +288,19 @@ case "$1" in ;; debian*|ubuntu*) sudo -E systemctl enable nfs-kernel-server - sudo -E systemctl enable qemu-guest-agent sudo -E systemctl enable smbd + + # enable usershares (disabled by default on ubuntu 26.04) + sudo -E sed -i '/usershare max shares/s/^#//' /etc/samba/smb.conf + + # add systemd drop-in to allow 
the service to be enabled + sudo -E mkdir -p /etc/systemd/system/qemu-guest-agent.service.d/ + sudo -E tee /etc/systemd/system/qemu-guest-agent.service.d/override.conf </dev/null ;; - ubuntu24) + ubuntu24|ubuntu26) GRUB_CFG="/boot/grub/grub.cfg" GRUB_MKCONFIG="grub-mkconfig" echo 'GRUB_DISABLE_OS_PROBER="false"' \ diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps.sh index 267ae4ad3c7..6e8dd6d7546 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps.sh @@ -2,9 +2,12 @@ # 3) Wait for VM to boot from previous step and launch dependencies # script on it. # -# $1: OS name (like 'fedora41') -# $2: (optional) Experimental kernel version to install on fedora, -# like "6.14". +# qemu-3-deps.sh [--poweroff] OS_NAME [FEDORA_VERSION] +# +# --poweroff: Power off the VM after installing dependencies +# OS_NAME: OS name (like 'fedora41') +# FEDORA_VERSION: (optional) Experimental Fedora kernel version, like "6.14" to +# install instead of Fedora defaults. ###################################################################### .github/workflows/scripts/qemu-wait-for-vm.sh vm0 @@ -15,8 +18,13 @@ # we need to update the kernel version in zfs's META file to allow the # build to happen. We update our local copy of META here, since we know # it will be rsync'd up in the next step. -if [ -n "${2:-}" ] ; then - sed -i -E 's/Linux-Maximum: .+/Linux-Maximum: 99.99/g' META +# +# Look to see if the last argument looks like a kernel version. +ver="${@: -1}" +if [[ $ver =~ ^[0-9]+\.[0-9]+ ]] ; then + # We got a kernel version, update META to say we support it so we + # can test against it. 
+ sed -i -E 's/Linux-Maximum: .+/Linux-Maximum: '$ver'/g' META fi scp .github/workflows/scripts/qemu-3-deps-vm.sh zfs@vm0:qemu-3-deps-vm.sh diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-4-build-vm.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-4-build-vm.sh index bbfa2ec85b8..dfe70c4f1ef 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-4-build-vm.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-4-build-vm.sh @@ -5,10 +5,12 @@ # # Usage: # -# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--patch-level NUM] -# [--poweroff][--release][--repo][--tarball] +# qemu-4-build-vm.sh OS [--custom-branch BRANCH][--enable-debug][--dkms] +# [--patch-level NUM][--poweroff][--release][--repo][--tarball] # # OS: OS name like 'fedora41' +# --custom-branch: When building packages, checkout this version of ZFS to +# build, but use the current CI scripts to do it. # --enable-debug: Build RPMs with '--enable-debug' (for testing) # --dkms: Build DKMS RPMs as well # --patch-level NUM: Use a custom patch level number for packages. @@ -27,8 +29,27 @@ POWEROFF="" RELEASE="" REPO="" TARBALL="" +CUSTOM_BRANCH="" +PREV_BRANCH="" + +cleanup() { + if [ -n "$PREV_BRANCH" ] ; then + git checkout $PREV_BRANCH + fi +} + while [[ $# -gt 0 ]]; do case $1 in + --custom-branch) + CUSTOM_BRANCH="$2" + # If the user specifies a custom tag/branch to build, and the build + # fails, we want to make sure our workflow scripts are restored to the + # current (more modern) versions so the subsequent CI steps use those. 
+ shift + shift + PREV_BRANCH=$(git branch --show-current) + trap 'cleanup' ERR + ;; --enable-debug) ENABLE_DEBUG=1 shift @@ -337,7 +358,7 @@ fi # # rhel8.10 # almalinux9.5 -# fedora42 +# fedora44 source /etc/os-release if which hostnamectl &> /dev/null ; then # Fedora 42+ use hostnamectl @@ -367,6 +388,11 @@ if [ -n "$ENABLE_DEBUG" ] ; then extra="--enable-debug" fi +if [ -n "$CUSTOM_BRANCH" ] ; then + git fetch --unshallow + git checkout $CUSTOM_BRANCH +fi + # build case "$OS" in freebsd*) @@ -393,6 +419,8 @@ case "$OS" in ;; esac +git checkout $PREV_BRANCH +PREV_BRANCH="" # building the zfs module was ok echo 0 > /var/tmp/build-exitcode.txt diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh index ff3f0a356bb..62917f4cb72 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh @@ -25,8 +25,14 @@ cd lustre-release # Include Lustre patches to build against master/zfs-2.4.x. Once these # patches are merged we can remove these lines. 
+# +# LU-19539 osd-zfs: use osd_dmu_write() wrapper for xattrs +# LU-19761 osd-zfs: Build against ZFS 2.4.0 +# LU-19249 build: Compatibility updates for kernel v6.16 +# patches=('https://review.whamcloud.com/changes/fs%2Flustre-release~62101/revisions/2/patch?download' - 'https://review.whamcloud.com/changes/fs%2Flustre-release~63267/revisions/9/patch?download') + 'https://review.whamcloud.com/changes/fs%2Flustre-release~63267/revisions/9/patch?download' + 'https://review.whamcloud.com/changes/fs%2Flustre-release~60619/revisions/13/patch?download') for p in "${patches[@]}" ; do curl $p | base64 -d > patch diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh index c261cbfca06..a0612e5e0b2 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh @@ -79,6 +79,7 @@ function do_builtin_build() { cd $HOME/linux-$fullver ./scripts/config --enable ZFS + ./scripts/config --enable ZFS_DEBUG yes "" | make oldconfig make -j `nproc` ) &> /var/tmp/builtin.txt || rc=$? @@ -185,6 +186,13 @@ case "$OS" in sudo mount -o noatime /dev/vdb /var/tmp sudo chmod 1777 /var/tmp sudo mv -f /tmp/*.txt /var/tmp + + # Allow for longer RCU timeouts due to the heavily virtualized and + # potentially oversubscribed nature of the CI environment. 
+ rcu_cpu_stall_timeout="/sys/module/rcupdate/parameters/rcu_cpu_stall_timeout" + if test -f $rcu_cpu_stall_timeout; then + echo 120 | sudo sh -c "cat > '$rcu_cpu_stall_timeout'" + fi ;; esac diff --git a/sys/contrib/openzfs/.github/workflows/smatch.yml b/sys/contrib/openzfs/.github/workflows/smatch.yml index 305a1f0179b..b6f47d8d41a 100644 --- a/sys/contrib/openzfs/.github/workflows/smatch.yml +++ b/sys/contrib/openzfs/.github/workflows/smatch.yml @@ -3,6 +3,14 @@ name: smatch on: push: pull_request: + paths-ignore: + - 'man/**' + - '**.md' + - 'AUTHORS' + - 'COPYRIGHT' + - 'LICENSE' + - 'NOTICE' + - '.gitignore' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -10,6 +18,7 @@ concurrency: jobs: smatch: + if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs' runs-on: ubuntu-24.04 steps: - name: Checkout smatch diff --git a/sys/contrib/openzfs/.github/workflows/zfs-arm.yml b/sys/contrib/openzfs/.github/workflows/zfs-arm.yml index 6039e4736c4..84e1272f713 100644 --- a/sys/contrib/openzfs/.github/workflows/zfs-arm.yml +++ b/sys/contrib/openzfs/.github/workflows/zfs-arm.yml @@ -3,11 +3,30 @@ name: zfs-arm on: push: pull_request: + paths-ignore: + - 'man/**' + - '**.md' + - 'AUTHORS' + - 'COPYRIGHT' + - 'LICENSE' + - 'NOTICE' + - '.gitignore' workflow_dispatch: + inputs: + gcc_ver: + type: string + required: false + default: "" + description: "(optional) install specific GCC version, like '16'" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true jobs: zfs-arm: name: ZFS ARM build + if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs' runs-on: ubuntu-24.04-arm steps: - uses: actions/checkout@v6 @@ -18,6 +37,31 @@ jobs: timeout-minutes: 20 run: | sudo apt-get -y remove firefox || true + + # Do we want to test with a custom GCC version? 
+ if [ "${{ github.event.inputs.gcc_ver }}" != "" ] ; then + ver="${{ github.event.inputs.gcc_ver }}" + + sudo add-apt-repository ppa:ubuntu-toolchain-r/test + sudo apt-get update + + echo "GCCs available:" + awk '/Package: gcc-/{print $2}' /var/lib/apt/lists/*ubuntu-toolchain-r*Packages + + sudo apt-get -y install gcc g++ gcc-$ver g++-$ver + + sudo update-alternatives --remove-all gcc || true 2>&1 + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-$ver 100 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-$ver 100 + sudo update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 + sudo update-alternatives --set cc /usr/bin/gcc + sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 + sudo update-alternatives --set c++ /usr/bin/g++ + + sudo update-alternatives --set gcc "/usr/bin/gcc-$ver" + sudo update-alternatives --set g++ "/usr/bin/g++-$ver" + fi + .github/workflows/scripts/qemu-3-deps-vm.sh ubuntu24 # We're running the VM scripts locally on the runner, so need to fix @@ -28,7 +72,12 @@ jobs: - name: Build modules timeout-minutes: 30 run: | - .github/workflows/scripts/qemu-4-build-vm.sh --enable-debug ubuntu24 + # Even though we may have installed a newer GCC, the kernel builds don't + # seem to honor it, and instead use the older GCC. I assume this is + # to match up with whatever GCC version was used for the kernel. Always + # specify KERNEL_CC to get around this. This works when using the + # default GCC and with a custom GCC. 
+ KERNEL_CC=/usr/bin/gcc .github/workflows/scripts/qemu-4-build-vm.sh --enable-debug ubuntu24 # Quick sanity test since we're not running the full ZTS sudo modprobe zfs diff --git a/sys/contrib/openzfs/.github/workflows/zfs-qemu-packages.yml b/sys/contrib/openzfs/.github/workflows/zfs-qemu-packages.yml index c3a7397c6ae..e3333086e62 100644 --- a/sys/contrib/openzfs/.github/workflows/zfs-qemu-packages.yml +++ b/sys/contrib/openzfs/.github/workflows/zfs-qemu-packages.yml @@ -42,6 +42,11 @@ on: required: false default: "" description: "(optional) repo URL (blank: use http://download.zfsonlinux.org)" + custom_branch: + type: string + required: false + default: "" + description: "(optional) custom tag/branch to build using current CI (like 'zfs-2.2.9')" lookup: type: boolean required: false @@ -58,7 +63,7 @@ jobs: strategy: fail-fast: false matrix: - os: ['almalinux8', 'almalinux9', 'almalinux10', 'fedora42', 'fedora43', 'fedora44'] + os: ['almalinux8', 'almalinux9', 'almalinux10', 'fedora43', 'fedora44'] runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v6 @@ -94,9 +99,16 @@ jobs: if [ -n "${{ github.event.inputs.patch_level }}" ] ; then EXTRA="--patch-level ${{ github.event.inputs.patch_level }}" fi + if [ -n "${{ github.event.inputs.custom_branch }}" ] ; then + EXTRA+=" --custom-branch ${{ github.event.inputs.custom_branch }}" + fi .github/workflows/scripts/qemu-4-build.sh $EXTRA \ --repo --release --dkms --tarball ${{ matrix.os }} + + if [ -n "${{ github.event.inputs.custom_branch }}" ] ; then + echo "Built packages for ${{ github.event.inputs.custom_branch }}" + fi fi - name: Prepare artifacts diff --git a/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml b/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml index 4b4fd27543f..64ffee484a5 100644 --- a/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml +++ b/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml @@ -14,7 +14,7 @@ on: type: string required: false default: "" - description: "(optional) Only run on 
this specific OS (like 'fedora42' or 'alpine3-23')" + description: "(optional) Only run on this specific OS (like 'fedora44' or 'alpine3-23')" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -23,6 +23,7 @@ concurrency: jobs: test-config: name: Setup + if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs' runs-on: ubuntu-24.04 outputs: test_os: ${{ steps.os.outputs.os }} @@ -45,24 +46,27 @@ jobs: fi case "$ci_type" in + docs) + os_selection='[]' + ;; quick) - os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd15-1s", "ubuntu24"]' + os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora44", "freebsd15-1s", "ubuntu26"]' ;; linux) - os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora42", "fedora43", "fedora44", "ubuntu22", "ubuntu24"]' + os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora43", "fedora44", "ubuntu22", "ubuntu24", "ubuntu26"]' ;; freebsd) - os_selection='["freebsd13-5r", "freebsd14-4r", "freebsd13-5s", "freebsd14-4s", "freebsd15-1s", "freebsd16-0c"]' + os_selection='["freebsd14-4r", "freebsd14-4s", "freebsd15-0r", "freebsd15-1s", "freebsd16-0c"]' ;; *) # default list - os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora42", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24"]' + os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-0r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24", "ubuntu26"]' ;; esac # Repository-level override for OS selection. # Set vars.ZTS_OS_OVERRIDE in repo settings to restrict targets - # (e.g. 
'["debian13"]' or '["debian13", "fedora42"]'). + # (e.g. '["debian13"]' or '["debian13", "fedora44"]'). # Manual ZFS-CI-Type in commit messages bypasses the override. if [ -n "${{ vars.ZTS_OS_OVERRIDE }}" ] && [ "$ci_source" != "manual" ]; then override='${{ vars.ZTS_OS_OVERRIDE }}' @@ -91,15 +95,19 @@ jobs: qemu-vm: name: qemu-x86 needs: [ test-config ] + if: >- + (github.event_name == 'pull_request' || + github.repository != 'openzfs/zfs') && + needs.test-config.outputs.ci_type != 'docs' strategy: fail-fast: false matrix: # rhl: almalinux8, almalinux9, centos-streamX, fedora4x - # debian: debian12, debian13, ubuntu22, ubuntu24 + # debian: debian12, debian13, ubuntu22, ubuntu24, ubuntu26 # misc: archlinux, tumbleweed # FreeBSD variants of november 2025: - # FreeBSD Release: freebsd13-5r, freebsd14-4r, freebsd15-0r - # FreeBSD Stable: freebsd13-5s, freebsd14-4s, freebsd15-1s + # FreeBSD Release: freebsd14-4r, freebsd15-0r + # FreeBSD Stable: freebsd14-4s, freebsd15-1s # FreeBSD Current: freebsd16-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} runs-on: ubuntu-24.04 @@ -153,7 +161,10 @@ jobs: run: .github/workflows/scripts/qemu-8-summary.sh '${{ steps.artifact-upload.outputs.artifact-url }}' cleanup: - if: always() + if: >- + (github.event_name == 'pull_request' || + github.repository != 'openzfs/zfs') && + always() name: Cleanup runs-on: ubuntu-latest needs: [ qemu-vm ] diff --git a/sys/contrib/openzfs/.github/workflows/zloop.yml b/sys/contrib/openzfs/.github/workflows/zloop.yml index 7f76a670af9..edd2c391583 100644 --- a/sys/contrib/openzfs/.github/workflows/zloop.yml +++ b/sys/contrib/openzfs/.github/workflows/zloop.yml @@ -3,6 +3,14 @@ name: zloop on: push: pull_request: + paths-ignore: + - 'man/**' + - '**.md' + - 'AUTHORS' + - 'COPYRIGHT' + - 'LICENSE' + - 'NOTICE' + - '.gitignore' concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/sys/contrib/openzfs/Makefile.am b/sys/contrib/openzfs/Makefile.am index 
73382f86e6f..c1638aa4288 100644 --- a/sys/contrib/openzfs/Makefile.am +++ b/sys/contrib/openzfs/Makefile.am @@ -138,6 +138,7 @@ cstyle: ! -path './include/sys/lua/*' \ ! -path './module/lua/l*.[ch]' \ ! -path './module/zfs/lz4.c' \ + ! -path './tests/unit/munit.[ch]' \ $(cstyle_line) filter_executable = -exec test -x '{}' \; -print diff --git a/sys/contrib/openzfs/README.md b/sys/contrib/openzfs/README.md index fa348a24383..6a77cedb635 100644 --- a/sys/contrib/openzfs/README.md +++ b/sys/contrib/openzfs/README.md @@ -52,7 +52,7 @@ All RHEL (and compatible systems: AlmaLinux OS, Rocky Linux, etc) on the **full* All Ubuntu **LTS** releases are supported. -**Supported Ubuntu releases**: **24.04 “Noble”**, **22.04 “Jammy”**. +**Supported Ubuntu releases**: **26.04 “Resolute”**, **24.04 “Noble”**, **22.04 “Jammy”**. ### Debian @@ -68,4 +68,4 @@ Generally, if a distribution is following an LTS kernel, it should work well wit All FreeBSD releases receiving [security support](https://www.freebsd.org/security/#sup) are supported by OpenZFS. -**Supported FreeBSD releases**: **15.0**, **14.4**, **13.5**. +**Supported FreeBSD releases**: **15.0**, **14.4**. 
diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am index 6f8d0c4b1db..6e54be7466a 100644 --- a/sys/contrib/openzfs/cmd/Makefile.am +++ b/sys/contrib/openzfs/cmd/Makefile.am @@ -54,7 +54,6 @@ ztest_LDADD = \ libnvpair.la ztest_LDADD += -lm -ztest_LDFLAGS = -pthread include $(srcdir)/%D%/raidz_test/Makefile.am diff --git a/sys/contrib/openzfs/cmd/zarcstat.in b/sys/contrib/openzfs/cmd/zarcstat.in index 8ffd2048116..ad0e12e9fbf 100755 --- a/sys/contrib/openzfs/cmd/zarcstat.in +++ b/sys/contrib/openzfs/cmd/zarcstat.in @@ -565,10 +565,10 @@ def init(): update_hdr_intr() - # check if L2ARC exists + # check if L2ARC exists; fall back to l2_size for older kernels that + # do not export l2_ndev snap_stats() - l2_size = cur.get("l2_size") - if l2_size: + if cur.get("l2_ndev") or cur.get("l2_size"): l2exist = True if desired_cols: diff --git a/sys/contrib/openzfs/cmd/zarcsummary b/sys/contrib/openzfs/cmd/zarcsummary index 24a129d9ca7..5b6e35465ea 100755 --- a/sys/contrib/openzfs/cmd/zarcsummary +++ b/sys/contrib/openzfs/cmd/zarcsummary @@ -856,7 +856,10 @@ def section_l2arc(kstats_dict): # The L2ARC statistics live in the same section as the normal ARC stuff arc_stats = isolate_section('arcstats', kstats_dict) - if arc_stats['l2_size'] == '0': + # Skip the section only when no cache device is attached. Fall back to + # l2_size for older kernels that do not export l2_ndev. + if arc_stats.get('l2_ndev', '0') == '0' and \ + arc_stats['l2_size'] == '0': print('L2ARC not detected, skipping section\n') return diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 68c9696a8aa..05e005d929a 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -2802,18 +2802,18 @@ print_file_layout_raidz(vdev_t *vd, blkptr_t *bp, uint64_t file_offset, vd->vdev_children, vdrz->vd_nparity); raidz_row_t *rr = rm->rm_row[0]; - /* - * Account for out of order disks in raidz1. 
- * For now just reverse them back and adjust for it later. - */ - if (rr->rr_firstdatacol == 1 && (zio.io_offset & (1ULL << 20))) { - uint64_t devidx = rr->rr_col[0].rc_devidx; - rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; - rr->rr_col[1].rc_devidx = devidx; - } - if (!dump_opt['H']) { int last_disk = vd->vdev_children - 1; + /* + * Account for out of order disks in raidz1. + * For now just reverse them back and adjust for it later. + */ + if (rr->rr_firstdatacol == 1 && + (zio.io_offset & (1ULL << 20))) { + uint64_t devidx = rr->rr_col[0].rc_devidx; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[1].rc_devidx = devidx; + } int first_disk = rr->rr_col[0].rc_devidx; (void) printf("%12llx", (u_longlong_t)file_offset); @@ -2843,23 +2843,49 @@ print_file_layout_raidz(vdev_t *vd, blkptr_t *bp, uint64_t file_offset, static uint64_t next_offset = 0; if (next_offset != file_offset) { - (void) printf("skip hole\t-\t%llx\n", - (u_longlong_t)((file_offset - next_offset) >> - vd->vdev_ashift)); + (void) printf("skip hole\t-\t\t%lld\n", + (u_longlong_t)((file_offset - next_offset) / 512)); } next_offset = file_offset + BP_GET_LSIZE(bp); + uint64_t tmp_offset = file_offset; + for (int c = 0; c < rr->rr_cols; c++) { + boolean_t pcol = c < rr->rr_firstdatacol; raidz_col_t *rc = &rr->rr_col[c]; char *path = vd->vdev_child[rc->rc_devidx]->vdev_path; - // c < rr->rr_firstdatacol + if (rc->rc_size == 0) continue; - (void) printf("%s\t%llu\t%d\n", + (void) printf("%s\t\t%llu\t%d", zfs_basename(path), (u_longlong_t)(rc->rc_offset + VDEV_LABEL_START_SIZE)/512, (int)rc->rc_size/512); + if (dump_opt['v']) { + char label = pcol ? 'P' : 'D'; + int num; + + if (c < 2) { + num = 0; + } else { + num = pcol ? 
c : + (c - rr->rr_firstdatacol); + } + printf("\t%c%d", label, num); + if (dump_opt['v'] > 1) { + unsigned long long off; + if (pcol) + off = file_offset; + else + off = tmp_offset; + off = off / 512ULL; + printf("\t%llu", off); + } + } + if (!pcol) + tmp_offset += rc->rc_size; + printf("\n"); } } } @@ -2989,7 +3015,12 @@ dump_indirect_layout(dnode_t *dn) * Start layout with a header */ if (dump_opt['H']) { - (void) printf("DISK\t\tLBA\t\tCOUNT\n"); + (void) printf("DISK\t\t\tLBA\tCOUNT"); + if (dump_opt['v']) + (void) printf("\tTYPE"); + if (dump_opt['v'] > 1) + (void) printf("\tOFFSET"); + printf("\n"); } else { char diskhdr[16]; @@ -6325,22 +6356,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { int i; + boolean_t claimed = B_FALSE; + boolean_t ddt_block = B_FALSE; + boolean_t brt_block = B_FALSE; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; - /* - * This flag controls if we will issue a claim for the block while - * counting it, to ensure that all blocks are referenced in space maps. - * We don't issue claims if we're not doing leak tracking, because it's - * expensive if the user isn't interested. We also don't claim the - * second or later occurences of cloned or dedup'd blocks, because we - * already claimed them the first time. - */ - boolean_t do_claim = !dump_opt['L']; - spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); blkptr_t tempbp; @@ -6371,21 +6395,30 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE); /* - * ddt_lookup() can return NULL if this block didn't exist - * in the DDT and creating it would take the DDT over its - * quota. Since we got the block from disk, it must exist in - * the DDT, so this can't happen. However, when unique entries - * are pruned, the dedup bit can be set with no corresponding - * entry in the DDT. 
+ * ddt_lookup() can return NULL when unique entries are pruned + * from the DDT. */ if (dde == NULL) { ddt_exit(ddt); - goto skipped; + goto ddt_done; } /* Get the phys for this variant */ ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + /* + * DDT_PHYS_NONE means the block has the dedup bit set but + * its DVA doesn't match any phys in the entry. This can + * happen when a DVA was evicted from the DDT and re-added + * on a hash collision. The block may still have a BRT entry. + */ + if (v == DDT_PHYS_NONE) { + ddt_exit(ddt); + goto ddt_done; + } + + ddt_block = B_TRUE; + /* * This entry may have multiple sets of DVAs. We must claim * each set the first time we see them in a real block on disk, @@ -6400,8 +6433,14 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dde->dde_io = (void *)(((uintptr_t)dde->dde_io) | (1 << v)); - /* Consume a reference for this block. */ - if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0) + /* + * Consume a reference. If this variant's refcount is already + * zero, the DDT tracking is exhausted — more filesystem + * references exist than the DDT accounts for. + */ + boolean_t ddt_refcnt_exhausted = + (ddt_phys_refcnt(dde->dde_phys, v) == 0); + if (!ddt_refcnt_exhausted) ddt_phys_decref(dde->dde_phys, v); /* @@ -6430,20 +6469,21 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, bp = &tempbp; } - if (seen) { + if (seen && !ddt_refcnt_exhausted) { /* * The second or later time we see this block, * it's a duplicate and we count it. */ zcb->zcb_dedup_asize += BP_GET_ASIZE(bp); zcb->zcb_dedup_blocks++; - - /* Already claimed, don't do it again. */ - do_claim = B_FALSE; + claimed = B_TRUE; } ddt_exit(ddt); - } else if (zcb->zcb_brt_is_active && + } + +ddt_done: + if (!claimed && zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { /* * Cloned blocks are special. 
We need to count them, so we can @@ -6451,10 +6491,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, * only claim them once. * * To do this, we keep our own in-memory BRT. For each block - * we haven't seen before, we look it up in the real BRT and - * if its there, we note it and its refcount then proceed as - * normal. If we see the block again, we count it as a clone - * and then give it no further consideration. + * we haven't seen before, we look it up in the real BRT. If + * we see the block again, we count it as a clone. */ zdb_brt_entry_t zbre_search, *zbre; avl_index_t where; @@ -6462,36 +6500,27 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zbre_search.zbre_dva = bp->blk_dva[0]; zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); if (zbre == NULL) { - /* Not seen before; track it */ uint64_t refcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); if (refcnt > 0) { + brt_block = B_TRUE; zbre = umem_zalloc(sizeof (zdb_brt_entry_t), UMEM_NOFAIL); zbre->zbre_dva = bp->blk_dva[0]; zbre->zbre_refcount = refcnt; avl_insert(&zcb->zcb_brt, zbre, where); } - } else { - /* - * Second or later occurrence, count it and take a - * refcount. - */ - zcb->zcb_clone_asize += BP_GET_ASIZE(bp); - zcb->zcb_clone_blocks++; - - zbre->zbre_refcount--; - if (zbre->zbre_refcount == 0) { - avl_remove(&zcb->zcb_brt, zbre); - umem_free(zbre, sizeof (zdb_brt_entry_t)); + } else { + brt_block = B_TRUE; + if (zbre->zbre_refcount > 0) { + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + zbre->zbre_refcount--; + claimed = B_TRUE; } - - /* Already claimed, don't do it again. */ - do_claim = B_FALSE; } } -skipped: for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? 
type : ZDB_OT_TOTAL; @@ -6650,12 +6679,21 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, #undef BIN hist_skipped: - if (!do_claim) + if (claimed || dump_opt['L']) return; - VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, + int claim_err = zio_wait(zio_claim(NULL, zcb->zcb_spa, spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, - ZIO_FLAG_CANFAIL))); + ZIO_FLAG_CANFAIL)); + if (claim_err != 0) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("block claim error %d%s%s: %s\n", + claim_err, brt_block ? " (BRT)" : "", + ddt_block ? " (DDT)" : "", blkbuf); + zcb->zcb_haderrors = 1; + zcb->zcb_errors[claim_err]++; + } } static void @@ -7431,10 +7469,66 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { - if (dump_opt['L']) - return (B_FALSE); - boolean_t leaks = B_FALSE; + + /* + * Report leaked BRT entries whose refcount was not fully consumed by + * the traversal. + */ + if (zcb->zcb_brt_is_active) { + void *cookie = NULL; + zdb_brt_entry_t *zbre; + while ((zbre = avl_destroy_nodes( + &zcb->zcb_brt, &cookie)) != NULL) { + if (!dump_opt['L'] && zbre->zbre_refcount != 0) { + (void) printf("BRT leak: vdev %llu, " + "offset 0x%llx, refcount %llu\n", + (u_longlong_t)DVA_GET_VDEV( + &zbre->zbre_dva), + (u_longlong_t)DVA_GET_OFFSET( + &zbre->zbre_dva), + (u_longlong_t)zbre->zbre_refcount); + leaks = B_TRUE; + } + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + avl_destroy(&zcb->zcb_brt); + } + + if (dump_opt['L']) + return (leaks); + + /* + * Report leaked DDT entries whose refcount was not fully consumed by + * the traversal. Entries in the DDT ZAP that were never looked up + * are not detected here. 
+ */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_enter(ddt); + for (ddt_entry_t *dde = avl_first(&ddt->ddt_tree); dde != NULL; + dde = AVL_NEXT(&ddt->ddt_tree, dde)) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t refcnt = ddt_phys_refcnt(dde->dde_phys, + v); + if (refcnt == 0) + continue; + blkptr_t blk; + char blkbuf[BP_SPRINTF_LEN]; + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, + dde->dde_phys, v, &blk); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); + (void) printf("DDT leak: refcount %llu %s\n", + (u_longlong_t)refcnt, blkbuf); + leaks = B_TRUE; + } + } + ddt_exit(ddt); + } + vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; @@ -10136,7 +10230,7 @@ main(int argc, char **argv) * Automate cachefile */ if (!spa_config_path_env && !config_path_console && target && - libzfs_core_init() == 0) { + !dump_opt['l'] && libzfs_core_init() == 0) { char *pname = strdup(target); const char *value; nvlist_t *pnvl = NULL; @@ -10519,6 +10613,7 @@ main(int argc, char **argv) } if (dump_opt['f'] && os != NULL) { + dump_opt['v'] = verbose; dump_file_data_layout(os); } else if (dump_opt['B']) { dump_backup(target, objset_id, diff --git a/sys/contrib/openzfs/cmd/zed/Makefile.am b/sys/contrib/openzfs/cmd/zed/Makefile.am index 0166d072356..712917401a0 100644 --- a/sys/contrib/openzfs/cmd/zed/Makefile.am +++ b/sys/contrib/openzfs/cmd/zed/Makefile.am @@ -41,6 +41,5 @@ zed_LDADD = \ libnvpair.la zed_LDADD += -lrt $(LIBATOMIC_LIBS) $(LIBUDEV_LIBS) $(LIBUUID_LIBS) -zed_LDFLAGS = -pthread dist_noinst_DATA += %D%/agents/README.md diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c index 8aabf6d3bf7..ba3672a30a7 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c +++ 
b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c @@ -350,6 +350,60 @@ is_draid_fdomain_failure(fmd_hdl_t *hdl, libzfs_handle_t *zhdl, return (res); } +/* + * Returns B_TRUE if spare 'a' should be tried before spare 'b' when + * replacing a failed vdev with the given characteristics. + * + * Ordering criteria (most to least significant): + * 1. Distributed spare matching the failed vdev's dRAID is preferred + * most (distributed spares rebuild faster than traditional spares). + * Regular spares (no TOP_GUID) come next. Non-matching distributed + * spares are tried last, as the kernel will reject them anyway. + * 2. Matching rotational is preferred over mismatching. + * 3. Large enough is preferred over too small. + * 4. Smaller size is preferred over bigger (best fit). + */ +static boolean_t +spare_is_preferred(nvlist_t *a, nvlist_t *b, boolean_t have_rotational, + uint64_t vdev_rotational, uint64_t vdev_size, uint64_t top_guid) +{ + uint64_t a_top = 0, b_top = 0; + (void) nvlist_lookup_uint64(a, ZPOOL_CONFIG_TOP_GUID, &a_top); + (void) nvlist_lookup_uint64(b, ZPOOL_CONFIG_TOP_GUID, &b_top); + int a_pri = (a_top == 0) ? 1 : + (a_top == top_guid || top_guid == 0) ? 2 : 0; + int b_pri = (b_top == 0) ? 1 : + (b_top == top_guid || top_guid == 0) ? 
2 : 0; + if (a_pri != b_pri) + return (a_pri > b_pri); + + if (have_rotational) { + uint64_t a_rotational = 0, b_rotational = 0; + (void) nvlist_lookup_uint64(a, ZPOOL_CONFIG_VDEV_ROTATIONAL, + &a_rotational); + (void) nvlist_lookup_uint64(b, ZPOOL_CONFIG_VDEV_ROTATIONAL, + &b_rotational); + if ((a_rotational == vdev_rotational) != + (b_rotational == vdev_rotational)) + return (a_rotational == vdev_rotational); + } + + vdev_stat_t *vs; + unsigned int c; + uint64_t a_size = 0, b_size = 0; + if (nvlist_lookup_uint64_array(a, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0) + a_size = vs->vs_rsize; + if (nvlist_lookup_uint64_array(b, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0) + b_size = vs->vs_rsize; + boolean_t a_ok = (a_size >= vdev_size); + boolean_t b_ok = (b_size >= vdev_size); + if (a_ok != b_ok) + return (a_ok); + return (a_size < b_size); +} + /* * Given a vdev, attempt to replace it with every known spare until one * succeeds or we run out of devices to try. @@ -364,6 +418,10 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) char *dev_name; zprop_source_t source; int ashift; + uint64_t vdev_rotational = 0, vdev_size = 0, top_guid = 0; + boolean_t have_vdev_rotational; + vdev_stat_t *vs; + unsigned int c; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, @@ -377,6 +435,35 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) &spares, &nspares) != 0) return (B_FALSE); + /* + * Collect the failed vdev's parameters for optimal replacement. + */ + have_vdev_rotational = (nvlist_lookup_uint64(vdev, + ZPOOL_CONFIG_VDEV_ROTATIONAL, &vdev_rotational) == 0); + if (nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0) + vdev_size = vs->vs_rsize; + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_TOP_GUID, &top_guid); + + /* + * Build a sorted index array over the spares, so that better + * candidates are tried first. 
+ */ + uint_t order[nspares]; + for (s = 0; s < nspares; s++) + order[s] = s; + for (s = 1; s < nspares; s++) { + uint_t key = order[s]; + int j = (int)s - 1; + while (j >= 0 && spare_is_preferred(spares[key], + spares[order[j]], have_vdev_rotational, vdev_rotational, + vdev_size, top_guid)) { + order[j + 1] = order[j]; + j--; + } + order[j + 1] = key; + } + /* * lookup "ashift" pool property, we may need it for the replacement */ @@ -394,25 +481,26 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) * replace it. */ for (s = 0; s < nspares; s++) { + nvlist_t *spare = spares[order[s]]; boolean_t rebuild = B_FALSE; const char *spare_name, *type; - if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, + if (nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &spare_name) != 0) continue; /* prefer sequential resilvering for distributed spares */ - if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, + if ((nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) rebuild = B_TRUE; /* if set, add the "ashift" pool property to the spare nvlist */ if (source != ZPROP_SRC_DEFAULT) - (void) nvlist_add_uint64(spares[s], + (void) nvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT, ashift); (void) nvlist_add_nvlist_array(replacement, - ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spares[s], 1); + ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spare, 1); fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", dev_name, zfs_basename(spare_name)); diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c index 4c21c92bcd2..d448a3df60e 100644 --- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c +++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c @@ -9399,6 +9399,18 @@ main(int argc, char **argv) return (1); } + /* + * Special case ' --help|-?' 
+ */ + if (argc >= 3 && (strcmp(argv[2], "--help") == 0 || + strcmp(argv[2], "-?") == 0)) { + int idx; + if (find_command_idx(cmdname, &idx) == 0) { + current_command = &command_table[idx]; + usage(B_FALSE); + } + } + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 3ed7babc1ca..05ea5e35446 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -13878,6 +13878,18 @@ main(int argc, char **argv) if (strcmp(cmdname, "help") == 0) return (zpool_do_help(argc, argv)); + /* + * Special case ' --help|-?' + */ + if (argc >= 3 && (strcmp(argv[2], "--help") == 0 || + strcmp(argv[2], "-?") == 0)) { + int idx; + if (find_command_idx(cmdname, &idx) == 0) { + current_command = &command_table[idx]; + usage(B_FALSE); + } + } + if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); diff --git a/sys/contrib/openzfs/cmd/zstream/zstream.c b/sys/contrib/openzfs/cmd/zstream/zstream.c index f1a2fa75740..da74ab6e1e5 100644 --- a/sys/contrib/openzfs/cmd/zstream/zstream.c +++ b/sys/contrib/openzfs/cmd/zstream/zstream.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include "zstream.h" void @@ -53,9 +55,43 @@ zstream_usage(void) exit(1); } +static void sig_handler(int signo) +{ + struct sigaction action; + libspl_backtrace(STDERR_FILENO); + + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. + */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + raise(signo); +} + + int main(int argc, char *argv[]) { + /* + * Set up signal handlers, so if we crash due to bad data in the stream + * we can get more info. 
Unlike ztest, we don't bail out if we can't + * set up signal handlers, because zstream is very useful without them. + */ + struct sigaction action = { .sa_handler = sig_handler }; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + if (sigaction(SIGSEGV, &action, NULL) < 0) { + (void) fprintf(stderr, "zstream: cannot catch SIGSEGV: %s\n", + strerror(errno)); + } + if (sigaction(SIGABRT, &action, NULL) < 0) { + (void) fprintf(stderr, "zstream: cannot catch SIGABRT: %s\n", + strerror(errno)); + } + char *basename = strrchr(argv[0], '/'); basename = basename ? (basename + 1) : argv[0]; if (argc >= 1 && strcmp(basename, "zstreamdump") == 0) diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_dump.c b/sys/contrib/openzfs/cmd/zstream/zstream_dump.c index 6ccc57204c8..7757ee3b175 100644 --- a/sys/contrib/openzfs/cmd/zstream/zstream_dump.c +++ b/sys/contrib/openzfs/cmd/zstream/zstream_dump.c @@ -385,6 +385,20 @@ zstream_do_dump(int argc, char *argv[]) (void) ssread(buf, sz, &zc); if (ferror(send_stream)) perror("fread"); + + uint8_t *nv_header = (uint8_t *)buf; + boolean_t xdr = nv_header[0] == NV_ENCODE_XDR; + boolean_t big_endian = nv_header[1] == 0; + const char *nc; + if (xdr) { + nc = "NV_ENCODE_XDR"; + } else if (big_endian) { + nc = "NV_ENCODE_NATIVE (big-endian)"; + } else { + nc = "NV_ENCODE_NATIVE (little-endian)"; + } + printf("nvlist encoding = %s\n", nc); + err = nvlist_unpack(buf, sz, &nv, 0); if (err) { perror(strerror(err)); diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_recompress.c b/sys/contrib/openzfs/cmd/zstream/zstream_recompress.c index 5092b534a8f..f5abfa98b18 100644 --- a/sys/contrib/openzfs/cmd/zstream/zstream_recompress.c +++ b/sys/contrib/openzfs/cmd/zstream/zstream_recompress.c @@ -99,6 +99,7 @@ zstream_do_recompress(int argc, char *argv[]) exit(1); } + zfs_refcount_init(); abd_init(); fletcher_4_init(); zio_init(); @@ -353,6 +354,7 @@ zstream_do_recompress(int argc, char *argv[]) zio_fini(); zstd_fini(); abd_fini(); + 
zfs_refcount_fini(); return (0); } diff --git a/sys/contrib/openzfs/config/Rules.am b/sys/contrib/openzfs/config/Rules.am index 5117929cac5..c4a9641f58f 100644 --- a/sys/contrib/openzfs/config/Rules.am +++ b/sys/contrib/openzfs/config/Rules.am @@ -23,6 +23,7 @@ AM_CFLAGS += $(IMPLICIT_FALLTHROUGH) AM_CFLAGS += $(DEBUG_CFLAGS) AM_CFLAGS += $(ASAN_CFLAGS) AM_CFLAGS += $(UBSAN_CFLAGS) +AM_CFLAGS += $(PTHREAD_CFLAGS) AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) AM_CFLAGS += $(NO_FORMAT_ZERO_LENGTH) AM_CFLAGS += $(NO_FORMAT_TRUNCATION) @@ -57,6 +58,7 @@ endif AM_LDFLAGS = $(DEBUG_LDFLAGS) AM_LDFLAGS += $(ASAN_LDFLAGS) AM_LDFLAGS += $(UBSAN_LDFLAGS) +AM_LDFLAGS += $(PTHREAD_LIBS) if BUILD_FREEBSD AM_LDFLAGS += -fstack-protector-strong diff --git a/sys/contrib/openzfs/config/ax_pthread.m4 b/sys/contrib/openzfs/config/ax_pthread.m4 new file mode 100644 index 00000000000..daea8c5987e --- /dev/null +++ b/sys/contrib/openzfs/config/ax_pthread.m4 @@ -0,0 +1,523 @@ +# SPDX-License-Identifier: GPL-3.0-or-later WITH Autoconf-exception-macro +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_pthread.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) +# +# DESCRIPTION +# +# This macro figures out how to build C programs using POSIX threads. It +# sets the PTHREAD_LIBS output variable to the threads library and linker +# flags, and the PTHREAD_CFLAGS output variable to any special C compiler +# flags that are needed. (The user can also force certain compiler +# flags/libs to be tested by setting these environment variables.) +# +# Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is +# needed for multi-threaded programs (defaults to the value of CC +# respectively CXX otherwise). (This is necessary on e.g. AIX to use the +# special cc_r/CC_r compiler alias.) 
+# +# NOTE: You are assumed to not only compile your program with these flags, +# but also to link with them as well. For example, you might link with +# $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS +# $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS +# +# If you are only building threaded programs, you may wish to use these +# variables in your default LIBS, CFLAGS, and CC: +# +# LIBS="$PTHREAD_LIBS $LIBS" +# CFLAGS="$CFLAGS $PTHREAD_CFLAGS" +# CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS" +# CC="$PTHREAD_CC" +# CXX="$PTHREAD_CXX" +# +# In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant +# has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to +# that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX). +# +# Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the +# PTHREAD_PRIO_INHERIT symbol is defined when compiling with +# PTHREAD_CFLAGS. +# +# ACTION-IF-FOUND is a list of shell commands to run if a threads library +# is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it +# is not found. If ACTION-IF-FOUND is not specified, the default action +# will define HAVE_PTHREAD. +# +# Please let the authors know if this macro fails on any platform, or if +# you have any other suggestions or comments. This macro was based on work +# by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help +# from M. Frigo), as well as ac_pthread and hb_pthread macros posted by +# Alejandro Forero Cuervo to the autoconf macro repository. We are also +# grateful for the helpful feedback of numerous users. +# +# Updated for Autoconf 2.68 by Daniel Richard G. +# +# LICENSE +# +# Copyright (c) 2008 Steven G. Johnson +# Copyright (c) 2011 Daniel Richard G. 
+# Copyright (c) 2019 Marc Stevens +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <https://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 31 + +AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD]) +AC_DEFUN([AX_PTHREAD], [ +AC_REQUIRE([AC_CANONICAL_HOST]) +AC_REQUIRE([AC_PROG_CC]) +AC_REQUIRE([AC_PROG_SED]) +AC_LANG_PUSH([C]) +ax_pthread_ok=no + +# We used to check for pthread.h first, but this fails if pthread.h +# requires special compiler flags (e.g. on Tru64 or Sequent). +# It gets checked for in the link test anyway. 
+ +# First of all, check if the user has set any of the PTHREAD_LIBS, +# etcetera environment variables, and if threads linking works using +# them: +if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then + ax_pthread_save_CC="$CC" + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"]) + AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"]) + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS]) + AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes]) + AC_MSG_RESULT([$ax_pthread_ok]) + if test "x$ax_pthread_ok" = "xno"; then + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" + fi + CC="$ax_pthread_save_CC" + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" +fi + +# We must check for the threads library under a number of different +# names; the ordering is very important because some systems +# (e.g. DEC) have both -lpthread and -lpthreads, where one of the +# libraries is broken (non-POSIX). + +# Create a list of thread flags to try. Items with a "," contain both +# C compiler flags (before ",") and linker flags (after ","). Other items +# starting with a "-" are C compiler flags, and remaining items are +# library names, except for "none" which indicates that we try without +# any flags at all, and "pthread-config" which is a program returning +# the flags for the Pth emulation library. + +ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" + +# The ordering *is* (sometimes) important. 
Some notes on the +# individual items follow: + +# pthreads: AIX (must check this before -lpthread) +# none: in case threads are in libc; should be tried before -Kthread and +# other compiler flags to prevent continual compiler warnings +# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) +# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64 +# (Note: HP C rejects this with "bad form for `-t' option") +# -pthreads: Solaris/gcc (Note: HP C also rejects) +# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it +# doesn't hurt to check since this sometimes defines pthreads and +# -D_REENTRANT too), HP C (must be checked before -lpthread, which +# is present but should not be used directly; and before -mthreads, +# because the compiler interprets this as "-mt" + "-hreads") +# -mthreads: Mingw32/gcc, Lynx/gcc +# pthread: Linux, etcetera +# --thread-safe: KAI C++ +# pthread-config: use pthread-config program (for GNU Pth library) + +case $host_os in + + freebsd*) + + # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) + # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) + + ax_pthread_flags="-kthread lthread $ax_pthread_flags" + ;; + + hpux*) + + # From the cc(1) man page: "[-mt] Sets various -D flags to enable + # multi-threading and also sets -lpthread." + + ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags" + ;; + + openedition*) + + # IBM z/OS requires a feature-test macro to be defined in order to + # enable POSIX threads at all, so give the user a hint if this is + # not set. (We don't define these ourselves, as they can affect + # other portions of the system API in unpredictable ways.) 
+ + AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING], + [ +# if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS) + AX_PTHREAD_ZOS_MISSING +# endif + ], + [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])]) + ;; + + solaris*) + + # On Solaris (at least, for some versions), libc contains stubbed + # (non-functional) versions of the pthreads routines, so link-based + # tests will erroneously succeed. (N.B.: The stubs are missing + # pthread_cleanup_push, or rather a function called by this macro, + # so we could check for that, but who knows whether they'll stub + # that too in a future libc.) So we'll check first for the + # standard Solaris way of linking pthreads (-mt -lpthread). + + ax_pthread_flags="-mt,-lpthread pthread $ax_pthread_flags" + ;; +esac + +# Are we compiling with Clang? + +AC_CACHE_CHECK([whether $CC is Clang], + [ax_cv_PTHREAD_CLANG], + [ax_cv_PTHREAD_CLANG=no + # Note that Autoconf sets GCC=yes for Clang as well as GCC + if test "x$GCC" = "xyes"; then + AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG], + [/* Note: Clang 2.7 lacks __clang_[a-z]+__ */ +# if defined(__clang__) && defined(__llvm__) + AX_PTHREAD_CC_IS_CLANG +# endif + ], + [ax_cv_PTHREAD_CLANG=yes]) + fi + ]) +ax_pthread_clang="$ax_cv_PTHREAD_CLANG" + + +# GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC) + +# Note that for GCC and Clang -pthread generally implies -lpthread, +# except when -nostdlib is passed. 
+# This is problematic using libtool to build C++ shared libraries with pthread: +# [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25460 +# [2] https://bugzilla.redhat.com/show_bug.cgi?id=661333 +# [3] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=468555 +# To solve this, first try -pthread together with -lpthread for GCC + +AS_IF([test "x$GCC" = "xyes"], + [ax_pthread_flags="-pthread,-lpthread -pthread -pthreads $ax_pthread_flags"]) + +# Clang takes -pthread (never supported any other flag), but we'll try with -lpthread first + +AS_IF([test "x$ax_pthread_clang" = "xyes"], + [ax_pthread_flags="-pthread,-lpthread -pthread"]) + + +# The presence of a feature test macro requesting re-entrant function +# definitions is, on some systems, a strong hint that pthreads support is +# correctly enabled + +case $host_os in + darwin* | hpux* | linux* | osf* | solaris*) + ax_pthread_check_macro="_REENTRANT" + ;; + + aix*) + ax_pthread_check_macro="_THREAD_SAFE" + ;; + + *) + ax_pthread_check_macro="--" + ;; +esac +AS_IF([test "x$ax_pthread_check_macro" = "x--"], + [ax_pthread_check_cond=0], + [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"]) + + +if test "x$ax_pthread_ok" = "xno"; then +for ax_pthread_try_flag in $ax_pthread_flags; do + + case $ax_pthread_try_flag in + none) + AC_MSG_CHECKING([whether pthreads work without any flags]) + ;; + + *,*) + PTHREAD_CFLAGS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\1/"` + PTHREAD_LIBS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\2/"` + AC_MSG_CHECKING([whether pthreads work with "$PTHREAD_CFLAGS" and "$PTHREAD_LIBS"]) + ;; + + -*) + AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag]) + PTHREAD_CFLAGS="$ax_pthread_try_flag" + ;; + + pthread-config) + AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no]) + AS_IF([test "x$ax_pthread_config" = "xno"], [continue]) + PTHREAD_CFLAGS="`pthread-config --cflags`" + PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" + ;; 
+ + *) + AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag]) + PTHREAD_LIBS="-l$ax_pthread_try_flag" + ;; + esac + + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + + # Check for various functions. We must include pthread.h, + # since some functions may be macros. (On the Sequent, we + # need a special flag -Kthread to make this header compile.) + # We check for pthread_join because it is in -lpthread on IRIX + # while pthread_create is in libc. We check for pthread_attr_init + # due to DEC craziness with -lpthreads. We check for + # pthread_cleanup_push because it is one of the few pthread + # functions on Solaris that doesn't have a non-functional libc stub. + # We try pthread_create on general principles. + + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h> +# if $ax_pthread_check_cond +# error "$ax_pthread_check_macro must be defined" +# endif + static void *some_global = NULL; + static void routine(void *a) + { + /* To avoid any unused-parameter or + unused-but-set-parameter warning. */ + some_global = a; + } + static void *start_routine(void *a) { return a; }], + [pthread_t th; pthread_attr_t attr; + pthread_create(&th, 0, start_routine, 0); + pthread_join(th, 0); + pthread_attr_init(&attr); + pthread_cleanup_push(routine, 0); + pthread_cleanup_pop(0) /* ; */])], + [ax_pthread_ok=yes], + []) + + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" + + AC_MSG_RESULT([$ax_pthread_ok]) + AS_IF([test "x$ax_pthread_ok" = "xyes"], [break]) + + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" +done +fi + + +# Clang needs special handling, because older versions handle the -pthread +# option in a rather... idiosyncratic way + +if test "x$ax_pthread_clang" = "xyes"; then + + # Clang takes -pthread; it has never supported any other flag + + # (Note 1: This will need to be revisited if a system that Clang + # supports has POSIX threads in a separate library. 
This tends not + # to be the way of modern systems, but it's conceivable.) + + # (Note 2: On some systems, notably Darwin, -pthread is not needed + # to get POSIX threads support; the API is always present and + # active. We could reasonably leave PTHREAD_CFLAGS empty. But + # -pthread does define _REENTRANT, and while the Darwin headers + # ignore this macro, third-party headers might not.) + + # However, older versions of Clang make a point of warning the user + # that, in an invocation where only linking and no compilation is + # taking place, the -pthread option has no effect ("argument unused + # during compilation"). They expect -pthread to be passed in only + # when source code is being compiled. + # + # Problem is, this is at odds with the way Automake and most other + # C build frameworks function, which is that the same flags used in + # compilation (CFLAGS) are also used in linking. Many systems + # supported by AX_PTHREAD require exactly this for POSIX threads + # support, and in fact it is often not straightforward to specify a + # flag that is used only in the compilation phase and not in + # linking. Such a scenario is extremely rare in practice. + # + # Even though use of the -pthread flag in linking would only print + # a warning, this can be a nuisance for well-run software projects + # that build with -Werror. So if the active version of Clang has + # this misfeature, we search for an option to squash it. 
+ + AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread], + [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG], + [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown + # Create an alternate version of $ac_link that compiles and + # links in two steps (.c -> .o, .o -> exe) instead of one + # (.c -> exe), because the warning occurs only in the second + # step + ax_pthread_save_ac_link="$ac_link" + ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g' + ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"` + ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)" + ax_pthread_save_CFLAGS="$CFLAGS" + for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do + AS_IF([test "x$ax_pthread_try" = "xunknown"], [break]) + CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS" + ac_link="$ax_pthread_save_ac_link" + AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], + [ac_link="$ax_pthread_2step_ac_link" + AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], + [break]) + ]) + done + ac_link="$ax_pthread_save_ac_link" + CFLAGS="$ax_pthread_save_CFLAGS" + AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no]) + ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try" + ]) + + case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in + no | unknown) ;; + *) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;; + esac + +fi # $ax_pthread_clang = yes + + + +# Various other checks: +if test "x$ax_pthread_ok" = "xyes"; then + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + + # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. 
+ AC_CACHE_CHECK([for joinable pthread attribute], + [ax_cv_PTHREAD_JOINABLE_ATTR], + [ax_cv_PTHREAD_JOINABLE_ATTR=unknown + for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>], + [int attr = $ax_pthread_attr; return attr /* ; */])], + [ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break], + []) + done + ]) + AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \ + test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \ + test "x$ax_pthread_joinable_attr_defined" != "xyes"], + [AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], + [$ax_cv_PTHREAD_JOINABLE_ATTR], + [Define to necessary symbol if this constant + uses a non-standard name on your system.]) + ax_pthread_joinable_attr_defined=yes + ]) + + AC_CACHE_CHECK([whether more special flags are required for pthreads], + [ax_cv_PTHREAD_SPECIAL_FLAGS], + [ax_cv_PTHREAD_SPECIAL_FLAGS=no + case $host_os in + solaris*) + ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS" + ;; + esac + ]) + AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \ + test "x$ax_pthread_special_flags_added" != "xyes"], + [PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS" + ax_pthread_special_flags_added=yes]) + + AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT], + [ax_cv_PTHREAD_PRIO_INHERIT], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]], + [[int i = PTHREAD_PRIO_INHERIT; + return i;]])], + [ax_cv_PTHREAD_PRIO_INHERIT=yes], + [ax_cv_PTHREAD_PRIO_INHERIT=no]) + ]) + AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \ + test "x$ax_pthread_prio_inherit_defined" != "xyes"], + [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.]) + ax_pthread_prio_inherit_defined=yes + ]) + + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" + + # More AIX lossage: compile with *_r variant + if test "x$GCC" != "xyes"; then + case $host_os in + aix*) + AS_CASE(["x/$CC"], + 
[x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6], + [#handle absolute path differently from PATH based program lookup + AS_CASE(["x$CC"], + [x/*], + [ + AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"]) + AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])]) + ], + [ + AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC]) + AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])]) + ] + ) + ]) + ;; + esac + fi +fi + +test -n "$PTHREAD_CC" || PTHREAD_CC="$CC" +test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX" + +AC_SUBST([PTHREAD_LIBS]) +AC_SUBST([PTHREAD_CFLAGS]) +AC_SUBST([PTHREAD_CC]) +AC_SUBST([PTHREAD_CXX]) + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test "x$ax_pthread_ok" = "xyes"; then + ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1]) + : +else + ax_pthread_ok=no + $2 +fi +AC_LANG_POP +])dnl AX_PTHREAD diff --git a/sys/contrib/openzfs/config/kernel-fs-parse.m4 b/sys/contrib/openzfs/config/kernel-fs-parse.m4 new file mode 100644 index 00000000000..7a6ffa77238 --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-fs-parse.m4 @@ -0,0 +1,34 @@ +dnl # SPDX-License-Identifier: CDDL-1.0 +dnl # +dnl # 5.6 API change +dnl # Before 5.6, fs_parse() took a struct fs_parameter_description +dnl # which wraps the parameter specs with name and enum pointers. From 5.6, +dnl # the description struct was removed and fs_parse() accepts the +dnl # fs_parameter_spec directly. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FS_PARSE], [ + ZFS_LINUX_TEST_SRC([fs_parse], [ + #include + #include + ],[ + static const struct fs_parameter_spec specs[] = { + {} + }; + int test __attribute__ ((unused)); + struct fs_context *fc __attribute__ ((unused)) = NULL; + struct fs_parameter param __attribute__ ((unused)); + struct fs_parse_result result __attribute__ ((unused)); + test = fs_parse(fc, specs, &param, &result); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FS_PARSE], [ + AC_MSG_CHECKING([whether fs_parse() takes fs_parameter_spec directly]) + ZFS_LINUX_TEST_RESULT([fs_parse], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FS_PARSE_TAKES_SPEC, 1, + [fs_parse() takes fs_parameter_spec directly]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 index b40e34d373f..55f40767567 100644 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -78,6 +78,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FS_CONTEXT + ZFS_AC_KERNEL_SRC_FS_PARSE ZFS_AC_KERNEL_SRC_SB_DYING ZFS_AC_KERNEL_SRC_SET_NLINK ZFS_AC_KERNEL_SRC_SGET @@ -153,9 +154,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ;; esac - AC_MSG_CHECKING([for available kernel interfaces]) - ZFS_LINUX_TEST_COMPILE_ALL([kabi]) - AC_MSG_RESULT([done]) + ZFS_LINUX_TEST_COMPILE_ALL([kabi], [for available kernel interfaces]) ]) dnl # @@ -203,6 +202,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FS_CONTEXT + ZFS_AC_KERNEL_FS_PARSE ZFS_AC_KERNEL_SB_DYING ZFS_AC_KERNEL_SET_NLINK ZFS_AC_KERNEL_SGET @@ -753,6 +753,108 @@ AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ ], [], [yes]) ]) +dnl # +dnl # Progress output for ZFS_LINUX_TEST_COMPILE_ALL +dnl # +dnl # From clean, we currently have ~250 kernel tests to compile. 
This can +dnl # take anywhere from a few seconds to a few minutes while we wait for +dnl # the module build invocation to complete (see ZFS_LINUX_COMPILE). +dnl # +dnl # To show some progress in the main set of tests, we start a background +dnl # job to monitor the build progress and update the output. +dnl # +AC_DEFUN([_ZFS_LINUX_TEST_COMPILE_PROGRESS_START], [ + dnl # normal "checking for..." output + AC_MSG_CHECKING([$2]) + + dnl # don't start the background job if configure was called with + dnl # --silent or --quiet, or if configure's output stream is not + dnl # attached to a terminal + AS_IF([test "x$silent" != "xyes" -a -t AS_MESSAGE_FD], [ + dnl # save "checking" message for cleanup later + _zfs_linux_test_progress_text="$2" + + dnl # new shell job in background + ( + dnl # ZFS_LINUX_CONFTEST_MAKEFILE adds one line per + dnl # test to the top Makefile, so the line count + dnl # is our target + total=$(wc -l < $1/Makefile) + count=0 + + dnl # eject if our parent process has gone away. this + dnl # is protection against the parent being killed. + dnl # (we can't use trap because autoconf generates + dnl # that and doesn't provide an easy way to hook it). + while kill -0 $$ 2>/dev/null ; do + + dnl # ZFS_LINUX_TEST_COMPILE_ALL has a short + dnl # second stage for modpost, where build.log + dnl # is recreated. we make some effort to both + dnl # detect that and handle it, mostly by + dnl # making sure the counter never goes + dnl # backwards. + if test "$count" -lt "$total" ; then + dnl # if build.log went away, then + dnl # we never got to do a last count, + dnl # so we can assume they're all + dnl # finished and just bump the count + dnl # to the total + if ! 
test -f $1/build.log ; then + count=$total + else + dnl # look for compilation lines + dnl # (CC) for .o files that + dnl # are in a dir (so not + dnl # whole-of-build artifacts) + dnl # and only have a single + dnl # period (so not .mod.o + dnl # link artifacts) + count_n=$(awk '/CC/ && /\/[[^\.]]+\.o$/ { c++ } END { print c }' $1/build.log 2>/dev/null) + if test "x$count_n" != "x" ; then + dnl # empty output + dnl # means awk failed, + dnl # likely build.log + dnl # went away. use + dnl # the current count + count=$count_n + fi + fi + + dnl # re-output the entire message with + dnl # the new counts + printf '\rchecking %s... %d/%d' "$2" "$count" "$total" >&6 + fi + + dnl # yield before loop + sleep 0.5 + done + ) & + + dnl # save the pid so we can kill it later + _zfs_linux_test_progress_pid=$! + ]) +]) + +AC_DEFUN([_ZFS_LINUX_TEST_COMPILE_PROGRESS_DONE], [ + dnl # only do cleanup if we actually started the job + AS_IF([test "x$_zfs_linux_test_progress_pid" != "x"], [ + dnl # kill it; no-op if it already died + kill $_zfs_linux_test_progress_pid 2>/dev/null + dnl # wait for it to really go away and clean it up + wait $_zfs_linux_test_progress_pid 2>/dev/null + dnl # reprint the original checking line. the control code + dnl # is ANSI "erase entire line" + printf '\r\033\1332Kchecking %s... ' "$_zfs_linux_test_progress_text" >&AS_MESSAGE_FD + dnl # cleanup for next run + _zfs_linux_test_progress_pid= + _zfs_linux_test_progress_text= + ]) + + dnl # normal final output for screen and config.log + AC_MSG_RESULT([$1]) +]) + dnl # dnl # Perform the compilation of the test cases in two phases. dnl # @@ -771,6 +873,10 @@ dnl # The maximum allowed parallelism can be controlled by setting the dnl # TEST_JOBS environment variable. Otherwise, it default to $(nproc). dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ + AS_IF([test "x$2" != "x"], [ + _ZFS_LINUX_TEST_COMPILE_PROGRESS_START([build], [$2]) + ]) + dnl # Phase 1 - Compilation only, final linking is skipped. 
ZFS_LINUX_TEST_COMPILE([$1], [build]) @@ -818,6 +924,10 @@ AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ ]) done ]) + + AS_IF([test "x$2" != "x"], [ + _ZFS_LINUX_TEST_COMPILE_PROGRESS_DONE([done]) + ]) ]) dnl # diff --git a/sys/contrib/openzfs/config/zfs-build.m4 b/sys/contrib/openzfs/config/zfs-build.m4 index 1e1485c4cf4..7aabad3868b 100644 --- a/sys/contrib/openzfs/config/zfs-build.m4 +++ b/sys/contrib/openzfs/config/zfs-build.m4 @@ -39,6 +39,18 @@ dnl # (If INVARIANTS is detected, we need to force DEBUG, or strange panics dnl # can ensue.) dnl # AC_DEFUN([ZFS_AC_DEBUG], [ + dnl # + dnl # In the Linux kernel copy-builtin build, assertion/debug support + dnl # is selected by CONFIG_ZFS_DEBUG (Kconfig). + dnl # + AH_BOTTOM([ +#ifdef CONFIG_ZFS +#undef ZFS_DEBUG +#ifdef CONFIG_ZFS_DEBUG +#define ZFS_DEBUG 1 +#endif +#endif]) + AC_MSG_CHECKING([whether assertion support will be enabled]) AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], diff --git a/sys/contrib/openzfs/configure.ac b/sys/contrib/openzfs/configure.ac index 3757b5e2cac..74e4ab3bdf8 100644 --- a/sys/contrib/openzfs/configure.ac +++ b/sys/contrib/openzfs/configure.ac @@ -54,6 +54,7 @@ AC_PROG_LN_S PKG_PROG_PKG_CONFIG AM_PROG_AS AM_PROG_CC_C_O +AX_PTHREAD AX_CODE_COVERAGE _AM_PROG_TAR(pax) diff --git a/sys/contrib/openzfs/contrib/debian/not-installed b/sys/contrib/openzfs/contrib/debian/not-installed index 9c08da5a6a7..efe17c90c3b 100644 --- a/sys/contrib/openzfs/contrib/debian/not-installed +++ b/sys/contrib/openzfs/contrib/debian/not-installed @@ -2,7 +2,6 @@ usr/bin/zarcsummary.py usr/share/zfs/zfs-helpers.sh etc/default/zfs etc/init.d -etc/sudoers.d etc/zfs/vdev_id.conf.alias.example etc/zfs/vdev_id.conf.multipath.example etc/zfs/vdev_id.conf.sas_direct.example diff --git a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c index d5513b7a43f..5477c7dc611 100644 --- a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c +++ 
b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c @@ -840,27 +840,41 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh, errno); return (-1); } - if (chown(runtime_path, 0, 0) != 0) { - pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d", - errno); + const int runtime_fd = open(runtime_path, + O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_DIRECTORY); + if (runtime_fd < 0) { + pam_syslog(pamh, LOG_ERR, "Can't open runtime path: %d", errno); return (-1); } - if (chmod(runtime_path, S_IRWXU) != 0) { + if (fchown(runtime_fd, 0, 0) != 0) { + pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d", + errno); + close(runtime_fd); + return (-1); + } + if (fchmod(runtime_fd, S_IRWXU) != 0) { pam_syslog(pamh, LOG_ERR, "Can't chmod runtime path: %d", errno); + close(runtime_fd); return (-1); } char *counter_path; - if (asprintf(&counter_path, "%s/%u", runtime_path, config->uid) == -1) + if (asprintf(&counter_path, "%u", config->uid) == -1) { + close(runtime_fd); return (-1); + } - const int fd = open(counter_path, + const int fd = openat(runtime_fd, counter_path, O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW, S_IRUSR | S_IWUSR); + int ret = errno; + free(counter_path); + close(runtime_fd); + if (fd < 0) { - pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", errno); + pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", ret); return (-1); } if (flock(fd, LOCK_EX) != 0) { @@ -871,7 +885,6 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh, char counter[20]; char *pos = counter; int remaining = sizeof (counter) - 1; - int ret; counter[sizeof (counter) - 1] = 0; while (remaining > 0 && (ret = read(fd, pos, remaining)) > 0) { remaining -= ret; diff --git a/sys/contrib/openzfs/copy-builtin b/sys/contrib/openzfs/copy-builtin index 9a430bfb289..d412437f556 100755 --- a/sys/contrib/openzfs/copy-builtin +++ b/sys/contrib/openzfs/copy-builtin @@ -43,6 +43,17 @@ config ZFS To compile this file system support as a module, choose M here. + If unsure, say N. 
+ +config ZFS_DEBUG + bool "ZFS debugging" + depends on ZFS + help + Enable ZFS debugging. This turns on all ASSERT() assertions, + enables additional debug-only code paths, and promotes + compiler warnings to errors. This should only be enabled for + development or troubleshooting. + If unsure, say N. EOF diff --git a/sys/contrib/openzfs/etc/Makefile.am b/sys/contrib/openzfs/etc/Makefile.am index 58b3cf563b6..5168c3cde13 100644 --- a/sys/contrib/openzfs/etc/Makefile.am +++ b/sys/contrib/openzfs/etc/Makefile.am @@ -1,10 +1,4 @@ # SPDX-License-Identifier: CDDL-1.0 -sudoersddir = $(sysconfdir)/sudoers.d -sudoersd_DATA = \ - %D%/sudoers.d/zfs - -dist_noinst_DATA += $(sudoersd_DATA) - sysconf_zfsdir = $(sysconfdir)/zfs @@ -88,8 +82,6 @@ systemdgenerator_PROGRAMS = \ %C%_systemd_system_generators_zfs_mount_generator_LDADD = \ libzfs.la -%C%_systemd_system_generators_zfs_mount_generator_LDFLAGS = -pthread - CPPCHECKTARGETS += $(systemdgenerator_PROGRAMS) endif diff --git a/sys/contrib/openzfs/etc/sudoers.d/zfs b/sys/contrib/openzfs/etc/sudoers.d/zfs deleted file mode 100644 index 82a25ba81ec..00000000000 --- a/sys/contrib/openzfs/etc/sudoers.d/zfs +++ /dev/null @@ -1,9 +0,0 @@ -## -## Allow any user to run `zpool iostat/status -c smart` in order -## to read basic SMART health statistics for a pool. -## -## CAUTION: Any syntax error introduced here will break sudo. 
-## Editing with 'visudo' is recommended: visudo -f /etc/sudoers.d/zfs -## - -# ALL ALL = (root) NOPASSWD: /usr/sbin/smartctl -a /dev/[hsv]d[a-z0-9]* diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/arc_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/arc_os.h index ad2aba23b90..6334d453f48 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/arc_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/arc_os.h @@ -29,6 +29,5 @@ #define _SYS_ARC_OS_H int param_set_arc_free_target(SYSCTL_HANDLER_ARGS); -int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); #endif diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h b/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h index c883836c2f8..64361bea90e 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h @@ -30,7 +30,6 @@ #include typedef enum { - RW_DRIVER = 2, RW_DEFAULT = 4, RW_NOLOCKDEP = 5 } krw_type_t; @@ -75,20 +74,35 @@ spl_rw_set_type(krwlock_t *rwp, krw_type_t type) { rwp->rw_type = type; } + +static inline void +spl_rw_lockdep_off(void) +{ + lockdep_off(); +} + +static inline void +spl_rw_lockdep_on(void) +{ + lockdep_on(); +} + static inline void spl_rw_lockdep_off_maybe(krwlock_t *rwp) \ { \ if (rwp && rwp->rw_type == RW_NOLOCKDEP) \ - lockdep_off(); \ + spl_rw_lockdep_off(); \ } static inline void spl_rw_lockdep_on_maybe(krwlock_t *rwp) \ { \ if (rwp && rwp->rw_type == RW_NOLOCKDEP) \ - lockdep_on(); \ + spl_rw_lockdep_on(); \ } #else /* CONFIG_LOCKDEP */ #define spl_rw_set_type(rwp, type) +#define spl_rw_lockdep_off() +#define spl_rw_lockdep_on() #define spl_rw_lockdep_off_maybe(rwp) #define spl_rw_lockdep_on_maybe(rwp) #endif /* CONFIG_LOCKDEP */ @@ -117,6 +131,56 @@ RW_READ_HELD(krwlock_t *rwp) * will be correctly located in the users code which is important * for the built in kernel lock analysis tools */ +#define spl_rw_tryenter_impl(rwp, rw) /* CSTYLED */ \ +({ \ + int _rc_ = 0; \ + \ + 
switch (rw) { \ + case RW_READER: \ + _rc_ = down_read_trylock(SEM(rwp)); \ + break; \ + case RW_WRITER: \ + if ((_rc_ = down_write_trylock(SEM(rwp)))) \ + spl_rw_set_owner(rwp); \ + break; \ + default: \ + VERIFY(0); \ + } \ + _rc_; \ +}) + +#define spl_rw_enter_impl(rwp, rw) /* CSTYLED */ \ +({ \ + switch (rw) { \ + case RW_READER: \ + down_read(SEM(rwp)); \ + break; \ + case RW_WRITER: \ + down_write(SEM(rwp)); \ + spl_rw_set_owner(rwp); \ + break; \ + default: \ + VERIFY(0); \ + } \ +}) + +#define spl_rw_exit_impl(rwp) /* CSTYLED */ \ +({ \ + if (RW_WRITE_HELD(rwp)) { \ + spl_rw_clear_owner(rwp); \ + up_write(SEM(rwp)); \ + } else { \ + ASSERT(RW_READ_HELD(rwp)); \ + up_read(SEM(rwp)); \ + } \ +}) + +#define spl_rw_downgrade_impl(rwp) /* CSTYLED */ \ +({ \ + spl_rw_clear_owner(rwp); \ + downgrade_write(SEM(rwp)); \ +}) + #define rw_init(rwp, name, type, arg) /* CSTYLED */ \ ({ \ static struct lock_class_key __key; \ @@ -140,60 +204,60 @@ RW_READ_HELD(krwlock_t *rwp) #define rw_tryenter(rwp, rw) /* CSTYLED */ \ ({ \ - int _rc_ = 0; \ - \ spl_rw_lockdep_off_maybe(rwp); \ - switch (rw) { \ - case RW_READER: \ - _rc_ = down_read_trylock(SEM(rwp)); \ - break; \ - case RW_WRITER: \ - if ((_rc_ = down_write_trylock(SEM(rwp)))) \ - spl_rw_set_owner(rwp); \ - break; \ - default: \ - VERIFY(0); \ - } \ + int _rc_ = spl_rw_tryenter_impl(rwp, rw); \ spl_rw_lockdep_on_maybe(rwp); \ _rc_; \ }) +#define rw_tryenter_nolockdep(rwp, rw) /* CSTYLED */ \ +({ \ + spl_rw_lockdep_off(); \ + int _rc_ = spl_rw_tryenter_impl(rwp, rw); \ + spl_rw_lockdep_on(); \ + _rc_; \ +}) + #define rw_enter(rwp, rw) /* CSTYLED */ \ ({ \ spl_rw_lockdep_off_maybe(rwp); \ - switch (rw) { \ - case RW_READER: \ - down_read(SEM(rwp)); \ - break; \ - case RW_WRITER: \ - down_write(SEM(rwp)); \ - spl_rw_set_owner(rwp); \ - break; \ - default: \ - VERIFY(0); \ - } \ + spl_rw_enter_impl(rwp, rw); \ spl_rw_lockdep_on_maybe(rwp); \ }) +#define rw_enter_nolockdep(rwp, rw) /* CSTYLED */ \ +({ \ + 
spl_rw_lockdep_off(); \ + spl_rw_enter_impl(rwp, rw); \ + spl_rw_lockdep_on(); \ +}) + #define rw_exit(rwp) /* CSTYLED */ \ ({ \ spl_rw_lockdep_off_maybe(rwp); \ - if (RW_WRITE_HELD(rwp)) { \ - spl_rw_clear_owner(rwp); \ - up_write(SEM(rwp)); \ - } else { \ - ASSERT(RW_READ_HELD(rwp)); \ - up_read(SEM(rwp)); \ - } \ + spl_rw_exit_impl(rwp); \ spl_rw_lockdep_on_maybe(rwp); \ }) +#define rw_exit_nolockdep(rwp) /* CSTYLED */ \ +({ \ + spl_rw_lockdep_off(); \ + spl_rw_exit_impl(rwp); \ + spl_rw_lockdep_on(); \ +}) + #define rw_downgrade(rwp) /* CSTYLED */ \ ({ \ spl_rw_lockdep_off_maybe(rwp); \ - spl_rw_clear_owner(rwp); \ - downgrade_write(SEM(rwp)); \ + spl_rw_downgrade_impl(rwp); \ spl_rw_lockdep_on_maybe(rwp); \ }) +#define rw_downgrade_nolockdep(rwp) /* CSTYLED */ \ +({ \ + spl_rw_lockdep_off(); \ + spl_rw_downgrade_impl(rwp); \ + spl_rw_lockdep_on(); \ +}) + #endif /* _SPL_RWLOCK_H */ diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h index 2b3668c6086..d6f025d0942 100644 --- a/sys/contrib/openzfs/include/sys/arc.h +++ b/sys/contrib/openzfs/include/sys/arc.h @@ -95,8 +95,7 @@ typedef void arc_prune_func_t(uint64_t bytes, void *priv); extern uint_t zfs_arc_average_blocksize; extern int l2arc_exclude_special; -/* generic arc_done_func_t's which you can use */ -arc_read_done_func_t arc_bcopy_func; +/* generic arc_done_func_t which can be used */ arc_read_done_func_t arc_getbuf_func; /* generic arc_prune_func_t wrapper for callbacks */ diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h index dbe712e2e73..7fbf5cee4fa 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -832,6 +832,8 @@ typedef struct arc_stats { * due to ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_metadata; + /* Number of L2ARC devices currently attached across all pools. 
*/ + kstat_named_t arcstat_l2_ndev; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; /* @@ -1103,7 +1105,7 @@ extern arc_sums_t arc_sums; extern hrtime_t arc_growtime; extern boolean_t arc_warm; extern uint_t arc_grow_retry; -extern uint_t arc_no_grow_shift; +extern uint_t zfs_arc_no_grow_shift; extern uint_t arc_shrink_shift; extern kmutex_t arc_prune_mtx; extern list_t arc_prune_list; @@ -1134,6 +1136,7 @@ extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS); extern int param_set_l2arc_dwpd_limit(ZFS_MODULE_PARAM_ARGS); +extern int param_set_arc_no_grow_shift(ZFS_MODULE_PARAM_ARGS); extern void l2arc_dwpd_bump_reset(void); /* used in zdb.c */ diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h index fcef32ecf9f..8e877166ada 100644 --- a/sys/contrib/openzfs/include/sys/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fs/zfs.h @@ -363,6 +363,7 @@ typedef enum { /* Small enough to not hog a whole line of printout in zpool(8). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_BOOLEAN_NA 2 +#define ZPROP_BOOLEAN_INHERIT 2 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" @@ -476,6 +477,8 @@ typedef enum { VDEV_PROP_SCHEDULER, VDEV_PROP_FDOMAIN, VDEV_PROP_FGROUP, + VDEV_PROP_ALLOC_BIAS, + VDEV_PROP_ROTATIONAL, VDEV_NUM_PROPS } vdev_prop_t; @@ -491,6 +494,16 @@ typedef enum { VDEV_SCHEDULER_OFF } vdev_scheduler_type_t; +/* + * Allocation bias for top-level vdevs (alloc_bias property). + */ +typedef enum vdev_alloc_bias { + VDEV_BIAS_NONE, + VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ + VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ + VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ +} vdev_alloc_bias_t; + /* * Dataset property functions shared between libzfs and kernel. 
*/ @@ -919,6 +932,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" +#define ZPOOL_CONFIG_VDEV_ROTATIONAL "rotational" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" diff --git a/sys/contrib/openzfs/include/sys/metaslab_impl.h b/sys/contrib/openzfs/include/sys/metaslab_impl.h index faeb96fe965..44a4d4ddf75 100644 --- a/sys/contrib/openzfs/include/sys/metaslab_impl.h +++ b/sys/contrib/openzfs/include/sys/metaslab_impl.h @@ -330,7 +330,7 @@ struct metaslab_group { * * As the space map grows (as a result of the appends) it will * eventually become space-inefficient. When the metaslab's in-core - * free tree is zfs_condense_pct/100 times the size of the minimal + * free tree is zfs_metaslab_condense_pct/100 times the size of the minimal * on-disk representation, we rewrite it in its minimized form. If a * metaslab needs to condense then we must set the ms_condensing flag to * ensure that allocations are not performed on the metaslab that is diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h index 634594aca12..84e78f5dbc8 100644 --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -155,14 +155,6 @@ struct vdev_queue { kmutex_t vq_lock; }; -typedef enum vdev_alloc_bias { - VDEV_BIAS_NONE, - VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ - VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ - VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ -} vdev_alloc_bias_t; - - /* * On-disk indirect vdev state. 
* @@ -600,6 +592,7 @@ extern boolean_t vdev_log_state_valid(vdev_t *vd); extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); +extern void vdev_sync_dispatch(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); diff --git a/sys/contrib/openzfs/include/sys/zap.h b/sys/contrib/openzfs/include/sys/zap.h index 66fbc1385d2..50e7079e014 100644 --- a/sys/contrib/openzfs/include/sys/zap.h +++ b/sys/contrib/openzfs/include/sys/zap.h @@ -24,6 +24,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2026, TrueNAS. */ #ifndef _SYS_ZAP_H @@ -121,13 +122,13 @@ typedef enum zap_flags { /* * Create a new zapobj with no attributes and return its object number. 
*/ -uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, +uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot, +uint64_t zap_create_dnsize(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, +uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags, +uint64_t zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, @@ -137,11 +138,22 @@ uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); + +/* + * Create a zap object and return a pointer to the newly allocated dnode via + * the allocated_dnode argument. The returned dnode will be held and the + * caller is responsible for releasing the hold by calling dnode_rele(). + */ uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx); +/* + * Create a new zapobj with no attributes, and add an entry to an existing + * zapobj with the given name as key and the object number of the new zapobj as + * the value. Returns the object number of the new zapobj. 
+ */ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx); uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, @@ -157,20 +169,21 @@ void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, * Create a new zapobj with no attributes from the given (unallocated) * object number. */ -int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, +int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot, +int zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); -int zap_create_claim_norm(objset_t *ds, uint64_t obj, +int zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, +int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); /* - * The zapobj passed in must be a valid ZAP object for all of the - * following routines. + * All operations on a zapobj take either the objset/objectid pair + * that "names" the object, or an existing dnode_t for the object. The + * zapobj passed in must be a valid ZAP object. */ /* @@ -178,7 +191,7 @@ int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, * * Frees the object number using dmu_object_free. */ -int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); +int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); /* * Manipulate attributes. @@ -207,21 +220,32 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); * fit will be transferred to 'buf'.
If the entire attribute was not * transferred, the call will return EOVERFLOW. */ -int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, +int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); /* * If rn_len is nonzero, realname will be set to the name of the found * entry (which may be different from the requested name if matchtype is - * not MT_EXACT). + * not zero). * * If normalization_conflictp is not NULL, it will be set if there is * another name with the same case/unicode normalized form. */ -int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, +int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); +int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp); + +/* + * The _uint64 variants take an array of uint64_t as the key. The ZAP must + * be created with ZAP_FLAG_UINT64_KEY. + */ int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, @@ -229,20 +253,31 @@ int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf, uint64_t *actual_num_integers); -int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); + +/* + * Lookup the attribute with the given name. Returns ENOENT if it does not + * exist, 0 if it does. This is like zap_lookup(), but may be more efficient. 
+ */ +int zap_contains(objset_t *os, uint64_t zapobj, const char *name); +int zap_contains_by_dnode(dnode_t *dn, const char *name); + +/* + * Prefetch the blocks within the ZAP where the given key is stored. The + * prefetch IO will occur in the background. + */ int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name); -int zap_prefetch_object(objset_t *os, uint64_t zapobj); + +/* Prefetch by uint64_t[] key. */ int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); int zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints); -int zap_lookup_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); -int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp); +/* + * Prefetch the entire ZAP object. Unlike zap_prefetch(), will block until + * the entire object is loaded into the ARC. + */ +int zap_prefetch_object(objset_t *os, uint64_t zapobj); /* * Create an attribute with the given name and value. @@ -250,13 +285,15 @@ int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, * If an attribute with the given name already exists, the call will * fail and return EEXIST. */ -int zap_add(objset_t *ds, uint64_t zapobj, const char *key, +int zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_add_by_dnode(dnode_t *dn, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); -int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, + +/* Add by uint64_t[] key.
*/ +int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, @@ -271,8 +308,12 @@ int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, * existing attribute's integer size, in which case the attribute's * integer size will be updated to the new value. */ -int zap_update(objset_t *ds, uint64_t zapobj, const char *name, +int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_by_dnode(dnode_t *dn, const char *name, int integer_size, + uint64_t num_integers, const void *val, dmu_tx_t *tx); + +/* Update by uint64_t[] key. */ int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); @@ -287,8 +328,12 @@ int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, * If the requested attribute does not exist, the call will fail and * return ENOENT. */ -int zap_length(objset_t *ds, uint64_t zapobj, const char *name, +int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers); +int zap_length_by_dnode(dnode_t *dn, const char *name, + uint64_t *integer_size, uint64_t *num_integers); + +/* Attribute length by uint64_t[] key. */ int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers); int zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, @@ -300,10 +345,12 @@ int zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, * If the specified attribute does not exist, the call will fail and * return ENOENT. 
*/ -int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); -int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, - matchtype_t mt, dmu_tx_t *tx); +int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); +int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx); + +/* Remove by uint64_t[] key. */ int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, @@ -313,9 +360,19 @@ int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, * Returns (in *count) the number of attributes in the specified zap * object. */ -int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); +int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count); int zap_count_by_dnode(dnode_t *dn, uint64_t *count); +/* + * Lookup an existing uint64 value, add the delta value to it, and store + * the new value. If the new value is 0, removes the key + * entirely. + */ +int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx); +int zap_increment_by_dnode(dnode_t *dn, const char *name, int64_t delta, + dmu_tx_t *tx); + /* * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string @@ -324,21 +381,8 @@ int zap_count_by_dnode(dnode_t *dn, uint64_t *count); */ int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name, uint64_t namelen); - -/* - * Transfer all the entries from fromobj into intoobj. Only works on - * int_size=8 num_integers=1 values. Fails if there are any duplicated - * entries. - */ -int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); - -/* Same as zap_join, but set the values to 'value'.
*/ -int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - uint64_t value, dmu_tx_t *tx); - -/* Same as zap_join, but add together any duplicated entries. */ -int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - dmu_tx_t *tx); +int zap_value_search_by_dnode(dnode_t *dn, + uint64_t value, uint64_t mask, char *name, uint64_t namelen); /* * Manipulate entries where the name + value are the "same" (the name is @@ -347,8 +391,10 @@ int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); -int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx); + +int zap_add_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx); +int zap_remove_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int_by_dnode(dnode_t *dn, uint64_t value); /* Here the key is an int and the value is a different int. */ int zap_add_int_key(objset_t *os, uint64_t obj, @@ -358,22 +404,19 @@ int zap_update_int_key(objset_t *os, uint64_t obj, int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep); -int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, - dmu_tx_t *tx); +int zap_add_int_key_by_dnode(dnode_t *dn, + uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_update_int_key_by_dnode(dnode_t *dn, + uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int_key_by_dnode(dnode_t *dn, + uint64_t key, uint64_t *valuep); -struct zap; -struct zap_leaf; -typedef struct zap_cursor { - /* This structure is opaque! 
*/ - objset_t *zc_objset; - struct zap *zc_zap; - struct zap_leaf *zc_leaf; - uint64_t zc_zapobj; - uint64_t zc_serialized; - uint64_t zc_hash; - uint32_t zc_cd; - boolean_t zc_prefetch; -} zap_cursor_t; +/* + * The interface for listing all the attributes of a zapobj can be + * thought of as cursor moving down a list of the attributes one by + * one. The cookie returned by the zap_cursor_serialize routine is + * persistent across system calls (and across reboot, even). + */ typedef struct { int za_integer_length; @@ -389,9 +432,6 @@ typedef struct { char za_name[]; } zap_attribute_t; -void zap_init(void); -void zap_fini(void); - /* * Alloc and free zap_attribute_t. */ @@ -399,22 +439,52 @@ zap_attribute_t *zap_attribute_alloc(void); zap_attribute_t *zap_attribute_long_alloc(void); void zap_attribute_free(zap_attribute_t *attrp); -/* - * The interface for listing all the attributes of a zapobj can be - * thought of as cursor moving down a list of the attributes one by - * one. The cookie returned by the zap_cursor_serialize routine is - * persistent across system calls (and across reboot, even). - */ +struct zap; +struct zap_leaf; + +typedef struct zap_cursor { + /* This structure is opaque! */ + struct zap *zc_zap; + struct zap_leaf *zc_leaf; + uint64_t zc_hash; + uint32_t zc_cd; + boolean_t zc_prefetch; + /* + * Legacy fields to maintain source compat with Lustre, which accesses + * them directly. Not to be used in new code! + */ + objset_t *zc_objset; + uint64_t zc_zapobj; +} zap_cursor_t; /* - * Initialize a zap cursor, pointing to the "first" attribute of the - * zapobj. You must _fini the cursor when you are done with it. + * Initialize a zap cursor, pointing to the "first" attribute of the zapobj. + * The entire zapobj will be prefetched. You must call zap_cursor_fini on the + * cursor when you are done with it.
*/ -void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj); -void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, - uint64_t zapobj); +int zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj); +int zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn); void zap_cursor_fini(zap_cursor_t *zc); +/* + * Initialize a cursor at the beginning, but request that we not prefetch + * the entire ZAP object. + */ +int zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, + uint64_t zapobj); + +/* + * Initialize a zap cursor pointing to the position recorded by + * zap_cursor_serialize (in the "serialized" argument). You can also + * use a "serialized" argument of 0 to start at the beginning of the + * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to + * zap_cursor_init(...).) + */ +int zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, + uint64_t zapobj, uint64_t serialized); +int zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn, + uint64_t serialized); + /* * Get the attribute currently pointed to by the cursor. Returns * ENOENT if at the end of the attributes. @@ -435,17 +505,6 @@ void zap_cursor_advance(zap_cursor_t *zc); */ uint64_t zap_cursor_serialize(zap_cursor_t *zc); -/* - * Initialize a zap cursor pointing to the position recorded by - * zap_cursor_serialize (in the "serialized" argument). You can also - * use a "serialized" argument of 0 to start at the beginning of the - * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to - * zap_cursor_init(...).) - */ -void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, - uint64_t zapobj, uint64_t serialized); - - #define ZAP_HISTOGRAM_SIZE 10 typedef struct zap_stats { @@ -535,7 +594,12 @@ typedef struct zap_stats { * statistics. This interface shouldn't be relied on unless you really * know what you're doing. 
*/ -int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); +int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs); +int zap_get_stats_by_dnode(dnode_t *dn, zap_stats_t *zs); + +/* ZAP subsystem setup/teardown */ +void zap_init(void); +void zap_fini(void); #ifdef __cplusplus } diff --git a/sys/contrib/openzfs/include/sys/zap_impl.h b/sys/contrib/openzfs/include/sys/zap_impl.h index d010c3c305c..ea8963f550f 100644 --- a/sys/contrib/openzfs/include/sys/zap_impl.h +++ b/sys/contrib/openzfs/include/sys/zap_impl.h @@ -26,6 +26,7 @@ * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2026, TrueNAS. */ #ifndef _SYS_ZAP_IMPL_H @@ -33,7 +34,6 @@ #include #include -#include #ifdef __cplusplus extern "C" { @@ -62,8 +62,9 @@ typedef struct mzap_phys { uint64_t mz_salt; uint64_t mz_normflags; uint64_t mz_pad[5]; - mzap_ent_phys_t mz_chunk[1]; + /* actually variable size depending on block size */ + mzap_ent_phys_t mz_chunk[]; } mzap_phys_t; typedef struct mzap_ent { @@ -170,6 +171,9 @@ typedef struct zap { } zap_u; } zap_t; +#define zap_f zap_u.zap_fat +#define zap_m zap_u.zap_micro + static inline zap_phys_t * zap_f_phys(zap_t *zap) { @@ -182,6 +186,10 @@ zap_m_phys(zap_t *zap) return (zap->zap_dbuf->db_data); } +/* + * zap_name_t carries the original key and whatever we've derived from it + * (normalised form, hash, etc) as we work through completing the operation. + */ typedef struct zap_name { zap_t *zn_zap; int zn_key_intlen; @@ -196,25 +204,94 @@ typedef struct zap_name { char zn_normbuf[]; } zap_name_t; -#define zap_f zap_u.zap_fat -#define zap_m zap_u.zap_micro +/* + * Allocate a zap_name_t. The longname flag ensures there is enough room to + * hold a long filename when the 'longname' pool feature is active. + */ +zap_name_t *zap_name_alloc(zap_t *zap, boolean_t longname); +/* + * Allocate a zap_name_t for the given key. 
zap_name_init_str() will be + * called to normalise the key and initialise the struct. + */ +zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt); + +/* + * Allocate a zap_name_t for a uint64 array key. + */ +zap_name_t *zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints); + +/* + * Free a zap_name_t. + */ +void zap_name_free(zap_name_t *zn); + +/* + * Initialise an existing zap_name_t with the normalised form of the key, + * computed according to the given matchtype. + */ +int zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt); + +/* + * Compare 'matchname' with the name represented by the zap_name_t, applying + * the same normalisation method first. Returns true if the normalised forms + * match, false otherwise. + */ boolean_t zap_match(zap_name_t *zn, const char *matchname); -int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + +/* + * Compute and return the 64-bit hash for the name, according to the name + * type and hash flags. + */ +uint64_t zap_hash(zap_name_t *zn); + +/* + * Return a zap_t for the given on-disk object, locked and ready for use. + * The zap_t will be allocated and loaded from disk if it's not already loaded. + */ +int zap_lock(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp); -void zap_unlockdir(zap_t *zap, const void *tag); +int zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp); + +/* Unlock and release a zap_t. */ +void zap_unlock(zap_t *zap, const void *tag); + +/* + * Try to upgrade a zap lock from READER to WRITER. If the upgrade is not + * possible without blocking, returns 0. If the upgrade happened, returns 1. + */ +int zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx); + +/* + * Upgrade a zap lock from READER to WRITER. If it can't be upgraded + * immediately it will block.
+ */ +void zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx); + +/* zap_t release function for when associated dbuf is evicted. */ void zap_evict_sync(void *dbu); -zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt); -void zap_name_free(zap_name_t *zn); + +/* Misc internal state & config. */ int zap_hashbits(zap_t *zap); uint32_t zap_maxcd(zap_t *zap); uint64_t zap_getflags(zap_t *zap); +/* Microzap implementation. */ +zap_t *mzap_open(dmu_buf_t *db); +int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); +mzap_ent_t *mze_find(zap_name_t *zn, zfs_btree_index_t *idx); +boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash); +void mze_destroy(zap_t *zap); +boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, + mzap_ent_t *mze, zfs_btree_index_t *idx); +void mzap_addent(zap_name_t *zn, uint64_t value); +void mzap_byteswap(mzap_phys_t *buf, size_t size); uint64_t zap_get_micro_max_size(spa_t *spa); -#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) - +/* Fatzap implementation. 
*/ void fzap_byteswap(void *buf, size_t size); int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, @@ -223,20 +300,17 @@ int fzap_lookup(zap_name_t *zn, uint64_t *actual_num_integers); void fzap_prefetch(zap_name_t *zn); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, const void *tag, dmu_tx_t *tx); -int fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, - const void *tag, dmu_tx_t *tx); + const void *val, dmu_tx_t *tx); +int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers); int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); void fzap_get_stats(zap_t *zap, zap_stats_t *zs); void zap_put_leaf(struct zap_leaf *l); - -int fzap_add_cd(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx); +int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); #ifdef __cplusplus diff --git a/sys/contrib/openzfs/include/sys/zio_impl.h b/sys/contrib/openzfs/include/sys/zio_impl.h index 42147adaf1a..62e7e27da38 100644 --- a/sys/contrib/openzfs/include/sys/zio_impl.h +++ b/sys/contrib/openzfs/include/sys/zio_impl.h @@ -139,12 +139,12 @@ enum zio_stage { ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W---- */ - ZIO_STAGE_BRT_FREE = 1 << 9, /* --F--- */ + ZIO_STAGE_DDT_READ_START = 1 << 9, /* R----- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R----- */ + ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W---- */ + ZIO_STAGE_DDT_FREE = 1 << 12, /* --F--- */ - ZIO_STAGE_DDT_READ_START = 1 << 10, /* R----- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R----- */ - ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W---- */ - 
ZIO_STAGE_DDT_FREE = 1 << 13, /* --F--- */ + ZIO_STAGE_BRT_FREE = 1 << 13, /* --F--- */ ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC-- */ ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC-- */ @@ -259,8 +259,7 @@ enum zio_stage { ZIO_STAGE_DVA_FREE) #define ZIO_DDT_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_FREE_BP_INIT | \ + (ZIO_FREE_PIPELINE | \ ZIO_STAGE_ISSUE_ASYNC | \ ZIO_STAGE_DDT_FREE) diff --git a/sys/contrib/openzfs/lib/libspl/Makefile.am b/sys/contrib/openzfs/lib/libspl/Makefile.am index 8b50c65c0e6..4b097297816 100644 --- a/sys/contrib/openzfs/lib/libspl/Makefile.am +++ b/sys/contrib/openzfs/lib/libspl/Makefile.am @@ -63,7 +63,3 @@ libspl_la_LIBADD = \ libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS) - -if BUILD_FREEBSD -libspl_assert_la_LIBADD += -lpthread -endif diff --git a/sys/contrib/openzfs/lib/libzfs/Makefile.am b/sys/contrib/openzfs/lib/libzfs/Makefile.am index 450c501556e..deae3534749 100644 --- a/sys/contrib/openzfs/lib/libzfs/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfs/Makefile.am @@ -76,7 +76,7 @@ libzfs_la_LIBADD = \ libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) -libzfs_la_LDFLAGS = -pthread +libzfs_la_LDFLAGS = -version-info 7:0:0 if !ASAN_ENABLED libzfs_la_LDFLAGS += -Wl,-z,defs @@ -86,8 +86,6 @@ if BUILD_FREEBSD libzfs_la_LIBADD += -lutil -lgeom endif -libzfs_la_LDFLAGS += -version-info 7:0:0 - pkgconfig_DATA += %D%/libzfs.pc dist_noinst_DATA += %D%/libzfs.abi %D%/libzfs.suppr diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi index ad28c876630..3f88f2fb83d 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi @@ -2553,7 +2553,7 @@ - + @@ -2605,6 +2605,9 @@ + + + @@ -6412,7 +6415,9 @@ - + + + diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_changelist.c b/sys/contrib/openzfs/lib/libzfs/libzfs_changelist.c index 
eac06f8f5ab..b1a2e17cb7a 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_changelist.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_changelist.c @@ -177,6 +177,7 @@ changelist_postfix(prop_changelist_t *clp) char shareopts[ZFS_MAXPROPLEN]; boolean_t commit_smb_shares = B_FALSE; boolean_t commit_nfs_shares = B_FALSE; + int rc = 0; /* * If CL_GATHER_DONT_UNMOUNT is set, it means we don't want to (un)mount @@ -266,7 +267,7 @@ changelist_postfix(prop_changelist_t *clp) const enum sa_protocol nfs[] = {SA_PROTOCOL_NFS, SA_NO_PROTOCOL}; if (sharenfs && mounted) { - zfs_share(cn->cn_handle, nfs); + rc = zfs_share(cn->cn_handle, nfs); commit_nfs_shares = B_TRUE; } else if (cn->cn_shared || clp->cl_waslegacy) { zfs_unshare(cn->cn_handle, NULL, nfs); @@ -275,7 +276,7 @@ changelist_postfix(prop_changelist_t *clp) const enum sa_protocol smb[] = {SA_PROTOCOL_SMB, SA_NO_PROTOCOL}; if (sharesmb && mounted) { - zfs_share(cn->cn_handle, smb); + rc = zfs_share(cn->cn_handle, smb); commit_smb_shares = B_TRUE; } else if (cn->cn_shared || clp->cl_waslegacy) { zfs_unshare(cn->cn_handle, NULL, smb); @@ -291,7 +292,15 @@ changelist_postfix(prop_changelist_t *clp) *p++ = SA_NO_PROTOCOL; zfs_commit_shares(proto); - return (0); + /* + * It's possible rc != 0 since we set a mountpoint or option while + * SMB/NFS was not running. This is fine, and we should not return + * an error up the stack. + * + * At this point we only want to report mountpoint/shareopts parsing + * errors. + */ + return (rc == SA_SYNTAX_ERR ? rc : 0); } /* diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c index 0b015d8bce6..f82211699f5 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c @@ -2031,12 +2031,21 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, return (0); } +/* + * Export the pool from the system. Setting force overrides the + * active-shared-spare check.
The caller must unmount all datasets + * in the pool first. + */ int zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) { return (zpool_export_common(zhp, force, B_FALSE, log_str)); } +/* + * Force-export the pool: bypasses the active-shared-spare check, and skips + * writing the exported-state labels and updating the cachefile. + */ int zpool_export_force(zpool_handle_t *zhp, const char *log_str) { @@ -2574,6 +2583,10 @@ xlate_init_err(int err) return (err); } +/* + * Start (or cancel/suspend/uninit) the initialize operation on every + * leaf vdev of the pool. + */ int zpool_initialize_one(zpool_handle_t *zhp, void *data) { @@ -2685,6 +2698,10 @@ zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, return (err == 0 ? 0 : -1); } +/* + * Start (or cancel/suspend/uninit) the initialize operation on the listed + * vdevs. Returns once the new state is committed. + */ int zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) @@ -2692,6 +2709,9 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE)); } +/* + * Like zpool_initialize(), but waits for each listed vdev to finish. + */ int zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) @@ -2746,6 +2766,10 @@ zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) } } +/* + * Start (or cancel/suspend) the trim operation on every leaf vdev of + * the pool. + */ int zpool_trim_one(zpool_handle_t *zhp, void *data) { @@ -3393,6 +3417,11 @@ __zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, return (ret); } +/* + * Look up a vdev in the pool by path, name, or guid. Returns the + * vdev's configuration nvlist, or NULL on no match. Also, fills + * in avail_spare, l2cache, and log if they are non-NULL. 
+ */ nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) @@ -4637,7 +4666,10 @@ zpool_reopen_one(zpool_handle_t *zhp, void *data) return (0); } -/* call into libzfs_core to execute the sync IOCTL per pool */ +/* + * Block until every buffered write for the pool has reached the + * underlying disks. + */ int zpool_sync_one(zpool_handle_t *zhp, void *data) { @@ -4913,6 +4945,10 @@ zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) return (0); } +/* + * Format the program name and its command-line arguments into a single + * space-separated string. + */ void zfs_save_arguments(int argc, char **argv, char *string, int len) { @@ -4925,6 +4961,10 @@ zfs_save_arguments(int argc, char **argv, char *string, int len) } } +/* + * Append a message to the pool's command-history log, retrievable via + * "zpool history". + */ int zpool_log_history(libzfs_handle_t *hdl, const char *message) { @@ -5220,6 +5260,11 @@ zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, free(mntpnt); } +/* + * Translate a (dataset object id, file object id) pair into a readable + * path. If the dataset is mounted the result is an absolute filesystem + * path; otherwise it is `dataset:path`. + */ void zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) @@ -5227,6 +5272,10 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_FALSE); } +/* + * Translate a (dataset object id, file object id) pair into a + * `dataset:path` string. + */ void zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) @@ -5281,6 +5330,10 @@ zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity, return (error); } +/* + * Store a boot configuration map in the bootenv area of each leaf + * vdev's labels. 
+ */ int zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap) { @@ -5294,6 +5347,9 @@ zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap) return (error); } +/* + * Read the boot configuration map from each leaf vdev's bootenv area. + */ int zpool_get_bootenv(zpool_handle_t *zhp, nvlist_t **nvlp) { @@ -5741,6 +5797,9 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, return (ENOENT); if (prop == VDEV_PROP_SIT_OUT) return (ENOENT); + /* Only valid for top-level vdevs */ + if (prop == VDEV_PROP_ALLOC_BIAS) + return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_share.c b/sys/contrib/openzfs/lib/libzfs/libzfs_share.c index bfac40f17de..98a09f7f331 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_share.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_share.c @@ -64,6 +64,10 @@ sa_enable_share(const char *zfsname, const char *mountpoint, { VALIDATE_PROTOCOL(protocol, SA_INVALID_PROTOCOL); + int error = sa_validate_shareopts(shareopts, protocol); + if (error != SA_OK) + return (error); + const struct sa_share_impl args = init_share(zfsname, mountpoint, shareopts); return (fstypes[protocol]->enable_share(&args)); @@ -111,6 +115,10 @@ sa_validate_shareopts(const char *options, enum sa_protocol protocol) { VALIDATE_PROTOCOL(protocol, SA_INVALID_PROTOCOL); + /* error out on invalid characters */ + if (strpbrk(options, "\a\b\f\n\r") != NULL) + return (SA_SYNTAX_ERR); + return (fstypes[protocol]->validate_shareopts(options)); } diff --git a/sys/contrib/openzfs/lib/libzfs_core/Makefile.am b/sys/contrib/openzfs/lib/libzfs_core/Makefile.am index ec7aa95aa02..751deeeb228 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfs_core/Makefile.am @@ -33,7 +33,7 @@ libzfs_core_la_LIBADD = \ libzfs_core_la_LIBADD += $(LTLIBINTL) -libzfs_core_la_LDFLAGS = -pthread +libzfs_core_la_LDFLAGS = -version-info 3:0:0 
if !ASAN_ENABLED libzfs_core_la_LDFLAGS += -Wl,-z,defs @@ -43,8 +43,6 @@ if BUILD_FREEBSD libzfs_core_la_LIBADD += -lutil -lgeom endif -libzfs_core_la_LDFLAGS += -version-info 3:0:0 - pkgconfig_DATA += %D%/libzfs_core.pc dist_noinst_DATA += %D%/libzfs_core.abi %D%/libzfs_core.suppr diff --git a/sys/contrib/openzfs/lib/libzpool/Makefile.am b/sys/contrib/openzfs/lib/libzpool/Makefile.am index 8192553072f..22c7ceaa1ba 100644 --- a/sys/contrib/openzfs/lib/libzpool/Makefile.am +++ b/sys/contrib/openzfs/lib/libzpool/Makefile.am @@ -166,6 +166,8 @@ nodist_libzpool_la_SOURCES = \ module/zfs/vdev_root.c \ module/zfs/vdev_trim.c \ module/zfs/zap.c \ + module/zfs/zap_fat.c \ + module/zfs/zap_impl.c \ module/zfs/zap_leaf.c \ module/zfs/zap_micro.c \ module/zfs/zcp.c \ @@ -212,7 +214,7 @@ libzpool_la_LIBADD = \ libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -lm -libzpool_la_LDFLAGS = -pthread +libzpool_la_LDFLAGS = -version-info 7:0:0 if !ASAN_ENABLED libzpool_la_LDFLAGS += -Wl,-z,defs @@ -222,8 +224,6 @@ if BUILD_FREEBSD libzpool_la_LIBADD += -lgeom endif -libzpool_la_LDFLAGS += -version-info 7:0:0 - if TARGET_CPU_POWERPC module/zfs/libzpool_la-vdev_raidz_math_powerpc_altivec.$(OBJEXT) : CFLAGS += -maltivec module/zfs/libzpool_la-vdev_raidz_math_powerpc_altivec.l$(OBJEXT): CFLAGS += -maltivec diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index c1fe65d2ad9..9967d9af739 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -4,6 +4,7 @@ .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" Copyright (c) 2023, 2024, 2025, Klara, Inc. +.\" Copyright (c) 2026, Mateusz Piotrowski <0mp@FreeBSD.org> .\" .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). 
You may not use this file except @@ -18,7 +19,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd September 15, 2025 +.Dd May 8, 2026 .Dt ZFS 4 .Os . @@ -389,6 +390,18 @@ this is or .Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 . . +.It Sy metaslab_df_alloc_threshold Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64 +Minimum size which forces the dynamic allocator to change its allocation +strategy. +Once the space map cannot satisfy an allocation of this size, it switches to a +more aggressive strategy (searching by size rather than offset). +. +.It Sy metaslab_df_free_pct Ns = Ns Sy 4 Ns % Pq uint +The minimum free space, in percent, which must be available in a space map to +continue allocations in a first-fit fashion. +Once free space drops below this level, allocations switch to a best-fit +strategy. +. .It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int If not searching forward (due to .Sy metaslab_df_max_search , metaslab_df_free_pct , @@ -445,6 +458,32 @@ This improves performance, especially when there are many metaslabs per vdev and the allocation can't actually be satisfied (so we would otherwise iterate all metaslabs). . +.It Sy zfs_metaslab_sm_blksz_no_log Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq int +Block size for the metaslab space maps in pools where the +.Sy log_spacemap +feature is disabled. +Multiple metaslabs are modified per transaction group, so a smaller block size +lets more, scattered I/O operations be issued. +Must be a power of 2 greater than +.Sy 4096 . +This parameter can only be set at module load time. +. +.It Sy zfs_metaslab_sm_blksz_with_log Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int +Block size for the metaslab space maps in pools where the +.Sy log_spacemap +feature is enabled. +Changes are batched in the per-pool log spacemap and flushed to each metaslab's +space map only occasionally, so a larger block size is more efficient. +Must be a power of 2 greater than +.Sy 4096 . 
+This parameter can only be set at module load time. +. +.It Sy zfs_metaslab_condense_pct Ns = Ns Sy 200 Ns % Pq uint +Condense an on-disk space map when its size exceeds this percentage of +the in-memory representation. +The minimum is +.Sy 100 . +. .It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq uint When a vdev is added, target this number of metaslabs per top-level vdev. . @@ -768,9 +807,15 @@ See also which serves a similar purpose but has a higher priority if nonzero. . .It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64 -Percentage of ARC dnodes to try to scan in response to demand for non-metadata -when the number of bytes consumed by dnodes exceeds -.Sy zfs_arc_dnode_limit . +Percentage used to size dnode prune requests. +The request size is the larger of two values: +.Sy zfs_arc_dnode_reduce_percent +applied to the dnode count above +.Sy zfs_arc_dnode_limit , +or +.Sy zfs_arc_dnode_reduce_percent +applied to the total dnode count +when non-evictable metadata exceeds 3/4 of the metadata target. . .It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint The ARC's buffer hash table is sized based on the assumption of an average @@ -911,6 +956,19 @@ but that was not proven to be useful. Number of missing top-level vdevs which will be allowed during pool import (only in read-only mode). . +.It Sy zfs_max_missing_tvds_cachefile Ns = Ns Sy 2 Pq u64 +Number of missing top-level vdevs tolerated when importing a pool +from a cachefile, before the trusted config is read from the MOS. +A cachefile can fall out of sync with the on-disk config after a +device removal that did not rewrite the cachefile, so the default +of 2 still lets the import reach a copy of the MOS. +. +.It Sy zfs_max_missing_tvds_scan Ns = Ns Sy 0 Pq u64 +Number of missing top-level vdevs tolerated when importing a pool +by scanning device paths, before the trusted config is read from +the MOS. +Defaults to 0 because a scan should detect every present device. +. 
.It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64 Maximum size in bytes allowed to be passed as .Sy zc_nvlist_src_size @@ -948,8 +1006,6 @@ equivalent to the greater of the number of online CPUs and If less than .Sy arc_c No >> Sy zfs_arc_no_grow_shift free memory is available, the ARC is not allowed to grow. -This parameter is -.Fx Ns -specific . . .It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int The ARC size is considered to be overflowing if it exceeds the current diff --git a/sys/contrib/openzfs/man/man7/vdevprops.7 b/sys/contrib/openzfs/man/man7/vdevprops.7 index b45128dd924..b52c6d4b023 100644 --- a/sys/contrib/openzfs/man/man7/vdevprops.7 +++ b/sys/contrib/openzfs/man/man7/vdevprops.7 @@ -142,6 +142,8 @@ See .Xr zpool-attach 8 . .It Sy trim_support Indicates if a leaf device supports trim operations. +.It Sy rotational +Indicates whether the device backing this vdev uses rotating media. .El .Pp The following native properties can be used to change the behavior of a vdev. @@ -183,9 +185,12 @@ output. A text comment up to 8192 characters long .It Sy bootsize The amount of space to reserve for the EFI system partition -.It Sy failfast +.It Sy failfast Ns = Ns Sy inherit Ns | Ns Sy on Ns | Ns Sy off If this device should propagate BIO errors back to ZFS, used to disable failfast. +.Sy inherit +causes the vdev to adopt the behavior of its parent vdev, +recursively up the tree. .It Sy sit_out Only valid for .Sy RAIDZ @@ -218,6 +223,21 @@ If this device should perform new allocations, used to disable a device when it is scheduled for later removal. See .Xr zpool-remove 8 . +.It Sy alloc_bias Ns = Ns Sy none Ns | Ns Sy log Ns | Ns Sy special Ns | Ns Sy dedup +Controls the allocation class for a top-level vdev. +Changes take effect after an export and import of the pool. +Changing to/from log is not implemented, since it may lead to data loss in +case of the log device failure. 
+Setting to +.Sy special +and +.Sy dedup +requires +.Sy feature@allocation_classes +to be enabled. +At least one top-level vdev must remain in the normal +.Pq Sy none +class. .It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy on Ns | Ns Sy off Controls how I/O requests are added to the vdev queue when reading or writing to this vdev. diff --git a/sys/contrib/openzfs/man/man8/zdb.8 b/sys/contrib/openzfs/man/man8/zdb.8 index f500e7e8a13..596e1d94e39 100644 --- a/sys/contrib/openzfs/man/man8/zdb.8 +++ b/sys/contrib/openzfs/man/man8/zdb.8 @@ -284,10 +284,15 @@ Decode and display block from an embedded block pointer specified by the arguments. .It Fl f , -file-layout Display the file layout of an object for the disks of a raidz vdev. +Numeric values in the display are hexadecimal. With .Fl H , the output is in scripted mode for easy parsing, with all values -being presented as 512 byte blocks. +being presented as 512 byte blocks in decimal; with +.Fl v , +the block type (parity or data) is displayed; with +.Fl vv , +the offset into the file for each block is also printed. Only a single top-level raidz vdev is supported. .It Fl h , -history Display pool history similar to diff --git a/sys/contrib/openzfs/man/man8/zpool-attach.8 b/sys/contrib/openzfs/man/man8/zpool-attach.8 index 04996ed4fa1..8394a5efba6 100644 --- a/sys/contrib/openzfs/man/man8/zpool-attach.8 +++ b/sys/contrib/openzfs/man/man8/zpool-attach.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd November 8, 2023 +.Dd May 9, 2026 .Dt ZPOOL-ATTACH 8 .Os . @@ -132,6 +132,35 @@ Waits until has finished resilvering or expanding before returning. .El . +.Sh EXAMPLES +.\" Example 1 is example 5 from zpool.8. +.\" Make sure to update them bidirectionally +.Ss Example 1 : No Making a non-mirrored ZFS Storage Pool mirrored +The following command converts an existing single device +.Ar sda +into a mirror by attaching a second device to it, +.Ar sdb .
+.Dl # Nm zpool Cm attach Ar tank Pa sda sdb +. +.Ss Example 2 : No Expanding a RAID-Z vdev with an additional disk +The following command adds +.Ar sdg +to the existing +.Ar raidz2-0 +vdev in +.Ar tank , +turning a 6-wide RAID-Z2 into a 7-wide RAID-Z2: +.Dl # Nm zpool Cm attach Ar tank raidz2-0 Pa sdg +Progress is reported by +.Nm zpool Cm status . +The operation requires the +.Sy raidz_expansion +pool feature, and +.Ar sdg +must be at least as large as the smallest existing disk in the vdev. +Old blocks keep their original data-to-parity ratio; only blocks written +after the expansion use the new ratio. +. .Sh SEE ALSO .Xr zpool-add 8 , .Xr zpool-detach 8 , diff --git a/sys/contrib/openzfs/man/man8/zpool-events.8 b/sys/contrib/openzfs/man/man8/zpool-events.8 index 3753139bdfe..12a11058072 100644 --- a/sys/contrib/openzfs/man/man8/zpool-events.8 +++ b/sys/contrib/openzfs/man/man8/zpool-events.8 @@ -458,12 +458,12 @@ ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W---- ZIO_STAGE_NOP_WRITE:0x00000100:-W---- -ZIO_STAGE_BRT_FREE:0x00000200:--F--- +ZIO_STAGE_DDT_READ_START:0x00000200:R----- +ZIO_STAGE_DDT_READ_DONE:0x00000400:R----- +ZIO_STAGE_DDT_WRITE:0x00000800:-W---- +ZIO_STAGE_DDT_FREE:0x00001000:--F--- -ZIO_STAGE_DDT_READ_START:0x00000400:R----- -ZIO_STAGE_DDT_READ_DONE:0x00000800:R----- -ZIO_STAGE_DDT_WRITE:0x00001000:-W---- -ZIO_STAGE_DDT_FREE:0x00002000:--F--- +ZIO_STAGE_BRT_FREE:0x00002000:--F--- ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC-- ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC-- diff --git a/sys/contrib/openzfs/man/man8/zpool-iostat.8 b/sys/contrib/openzfs/man/man8/zpool-iostat.8 index 4abe0895064..16d469849ee 100644 --- a/sys/contrib/openzfs/man/man8/zpool-iostat.8 +++ b/sys/contrib/openzfs/man/man8/zpool-iostat.8 @@ -109,10 +109,7 @@ environment variable set. If a script requires the use of a privileged command, like .Xr smartctl 8 , then it's recommended you allow the user access to it in -.Pa /etc/sudoers -or add the user to the -.Pa /etc/sudoers.d/zfs -file. 
+.Pa /etc/sudoers . .Pp If .Fl c diff --git a/sys/contrib/openzfs/man/man8/zpool.8 b/sys/contrib/openzfs/man/man8/zpool.8 index 4b07f96bbcb..25dff473c30 100644 --- a/sys/contrib/openzfs/man/man8/zpool.8 +++ b/sys/contrib/openzfs/man/man8/zpool.8 @@ -245,6 +245,7 @@ Invalid command line options were specified. . .Sh EXAMPLES .\" Examples 1, 2, 3, 4, 12, 13 are shared with zpool-create.8. +.\" Example 5 is shared with zpool-attach.8. .\" Examples 6, 14 are shared with zpool-add.8. .\" Examples 7, 16 are shared with zpool-list.8. .\" Examples 8 are shared with zpool-destroy.8. diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 47e739ea4d6..fa4085c84b0 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -4,9 +4,11 @@ ZFS_MODULE_CFLAGS += -std=gnu11 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes -ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @KERNEL_NO_FORMAT_ZERO_LENGTH@ +ZFS_MODULE_CFLAGS += @KERNEL_NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) +ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ +ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ zfs_include = @abs_top_srcdir@/include icp_include = @abs_srcdir@/icp/include zstd_include = @abs_srcdir@/zstd/include @@ -16,6 +18,12 @@ ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include src = @abs_srcdir@ obj = @abs_builddir@ else +ifeq ($(CONFIG_ZFS_DEBUG),y) +ZFS_MODULE_CFLAGS += -Werror +ZFS_MODULE_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG +else +ZFS_MODULE_CPPFLAGS += -UDEBUG -DNDEBUG +endif zfs_include = $(srctree)/include/zfs icp_include = $(src)/icp/include zstd_include = $(src)/zstd/include @@ -28,7 +36,6 @@ ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL -ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ # KASAN enables -Werror=frame-larger-than=1024, which # breaks oh so many parts of our 
build. @@ -408,6 +415,8 @@ ZFS_OBJS := \ vdev_root.o \ vdev_trim.o \ zap.o \ + zap_fat.o \ + zap_impl.o \ zap_leaf.o \ zap_micro.o \ zcp.o \ diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd index 30cf741b965..a0ddbeb9ae6 100644 --- a/sys/contrib/openzfs/module/Makefile.bsd +++ b/sys/contrib/openzfs/module/Makefile.bsd @@ -65,6 +65,12 @@ CFLAGS+= -DZFS_DEBUG -g CFLAGS += -DNDEBUG .endif +.for _SAN in KASAN KMSAN KUBSAN +.if defined(WITH_${_SAN}) && ${WITH_${_SAN}} == "true" +KERN_OPTS_EXTRA+= ${_SAN} +.endif +.endfor + .if defined(WITH_GCOV) && ${WITH_GCOV} == "true" CFLAGS+= -fprofile-arcs -ftest-coverage .endif @@ -345,6 +351,8 @@ SRCS+= abd.c \ vdev_root.c \ vdev_trim.c \ zap.c \ + zap_fat.c \ + zap_impl.c \ zap_leaf.c \ zap_micro.c \ zcp.c \ @@ -475,8 +483,8 @@ CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.zap_impl.c= -Wno-cast-qual CFLAGS.zap_leaf.c= -Wno-cast-qual -CFLAGS.zap_micro.c= -Wno-cast-qual CFLAGS.zcp.c= -Wno-cast-qual CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith diff --git a/sys/contrib/openzfs/module/nvpair/nvpair.c b/sys/contrib/openzfs/module/nvpair/nvpair.c index 07ac102145e..52678bb2bad 100644 --- a/sys/contrib/openzfs/module/nvpair/nvpair.c +++ b/sys/contrib/openzfs/module/nvpair/nvpair.c @@ -135,7 +135,8 @@ #define NVP_SIZE_CALC(name_len, data_len) \ (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len)) -static int i_get_value_size(data_type_t type, const void *data, uint_t nelem); +static int i_get_value_size(data_type_t type, const void *data, uint_t nelem, + size_t max_size); static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t nelem, const 
void *data); @@ -810,8 +811,10 @@ i_validate_nvpair(nvpair_t *nvp) * verify nvp_type, nvp_value_elem, and also possibly * verify string values and get the value size. */ - size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp)); size1 = nvp->nvp_size - NVP_VALOFF(nvp); + size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp), + size1); + if (size2 < 0 || size1 != NV_ALIGN(size2)) return (EFAULT); @@ -1002,12 +1005,21 @@ nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) * DATA_TYPE_STRING and * DATA_TYPE_STRING_ARRAY * Is data == NULL then the size of the string(s) is excluded. + * + * If 'max_size' is non-zero, then don't look beyond 'max_size' number of + * bytes when calculating a value size. Note that 'max_size' should include + * the NULL terminator byte when calculating string size. If 'max_size' is 0, + * it is ignored. */ static int -i_get_value_size(data_type_t type, const void *data, uint_t nelem) +i_get_value_size(data_type_t type, const void *data, uint_t nelem, + size_t max_size) { uint64_t value_sz; + if (max_size == 0) + max_size = INT32_MAX; + if (i_validate_type_nelem(type, nelem) != 0) return (-1); @@ -1052,10 +1064,15 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem) break; #endif case DATA_TYPE_STRING: - if (data == NULL) + if (data == NULL) { value_sz = 0; - else - value_sz = strlen(data) + 1; + } else { + value_sz = strnlen(data, max_size); + if (value_sz >= max_size) { + return (-1); /* string not terminated */ + } + value_sz += 1; + } break; case DATA_TYPE_BOOLEAN_ARRAY: value_sz = (uint64_t)nelem * sizeof (boolean_t); @@ -1089,16 +1106,23 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem) break; case DATA_TYPE_STRING_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint64_t); - if (data != NULL) { char *const *strs = data; uint_t i; + size_t newsize; /* no alignment requirement for strings */ for (i = 0; i < nelem; i++) { if (strs[i] == NULL) return (-1); - value_sz += strlen(strs[i]) + 1; + + 
newsize = strnlen(strs[i], max_size); + + if (newsize == max_size) + return (-1); /* not terminated */ + + value_sz += newsize + 1; /* +1 for NULL */ + max_size -= newsize + 1; } } break; @@ -1163,7 +1187,7 @@ nvlist_add_common(nvlist_t *nvl, const char *name, * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) included. */ - if ((value_sz = i_get_value_size(type, data, nelem)) < 0) + if ((value_sz = i_get_value_size(type, data, nelem, 0)) < 0) return (EINVAL); if (i_validate_nvpair_value(type, nelem, data) != 0) @@ -1588,7 +1612,7 @@ nvpair_value_common(const nvpair_t *nvp, data_type_t type, uint_t *nelem, #endif if (data == NULL) return (EINVAL); - if ((value_sz = i_get_value_size(type, NULL, 1)) < 0) + if ((value_sz = i_get_value_size(type, NULL, 1, 0)) < 0) return (EINVAL); memcpy(data, NVP_VALUE(nvp), (size_t)value_sz); if (nelem != NULL) @@ -3019,7 +3043,8 @@ nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) excluded. */ - if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) + if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp), + NVP_SIZE(nvp))) < 0) return (EFAULT); if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) @@ -3333,7 +3358,7 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) excluded. 
*/ - if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) + if ((value_sz = i_get_value_size(type, NULL, nelem, NVP_SIZE(nvp))) < 0) return (EFAULT); /* if there is no data to extract then return */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c index 02a2870c02b..7cb390cab23 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c @@ -72,9 +72,6 @@ SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, free_target, param_set_arc_free_target, 0, CTLFLAG_RW, "Desired number of free pages below which ARC triggers reclaim"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift, - param_set_arc_no_grow_shift, 0, ZMOD_RW, - "log2(fraction of ARC which must be free to allow growing)"); int64_t arc_available_memory(void) diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 22498bb721e..447aa5f8300 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -292,7 +292,7 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { int err, val; - val = arc_no_grow_shift; + val = zfs_arc_no_grow_shift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); @@ -300,7 +300,7 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) if (val < 0 || val >= arc_shrink_shift) return (EINVAL); - arc_no_grow_shift = val; + zfs_arc_no_grow_shift = val; if (arg2 != 0) warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift"); @@ -541,14 +541,14 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, /* * The in-core space map representation is more compact than its on-disk form.
- * The zfs_condense_pct determines how much more compact the in-core + * The zfs_metaslab_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ -extern uint_t zfs_condense_pct; +extern uint_t zfs_metaslab_condense_pct; -SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, - CTLFLAG_RWTUN, &zfs_condense_pct, 0, +SYSCTL_UINT(_vfs_zfs, OID_AUTO, metaslab_condense_pct, + CTLFLAG_RWTUN, &zfs_metaslab_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); @@ -617,18 +617,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, "Configuration cache file write, retry after failure, interval" " (seconds)"); -extern uint64_t zfs_max_missing_tvds_cachefile; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, - CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, - "Allow importing pools with missing top-level vdevs in cache file"); - -extern uint64_t zfs_max_missing_tvds_scan; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, - CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, - "Allow importing pools with missing top-level vdevs during scan"); - /* spa_misc.c */ extern int zfs_flags; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c index 9fe4042b507..6e340261980 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c @@ -188,6 +188,12 @@ spl_kvmalloc(size_t size, gfp_t lflags) return (ptr); } + /* + * vmalloc fallback. KM_VMEM may not have been requested originally if + * we've come through spl_kmem_alloc_impl(), so we need to remove + * __GFP_COMP, which is not a valid flag for vmalloc. 
+ */ + lflags &= ~__GFP_COMP; return (spl_vmalloc(size, lflags)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index dbc9aad936b..05f4fb51b4b 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -410,6 +410,24 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) return (0); } +int +param_set_arc_no_grow_shift(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + int error; + + error = kstrtoul(buf, 0, &val); + if (error) + return (SET_ERROR(error)); + + if (val >= arc_shrink_shift) + return (-SET_ERROR(EINVAL)); + + zfs_arc_no_grow_shift = val; + + return (0); +} + int param_set_l2arc_dwpd_limit(const char *buf, zfs_kernel_param_t *kp) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 66e10584ab5..7cc19fe5afb 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -931,8 +931,14 @@ vdev_disk_io_rw(zio_t *zio) return (SET_ERROR(EIO)); } + vdev_t *iter = v; + while (iter != NULL && iter->vdev_failfast == ZPROP_BOOLEAN_INHERIT) + iter = iter->vdev_parent; + + boolean_t failfast = iter ? 
iter->vdev_failfast == 1 : + vdev_prop_default_numeric(VDEV_PROP_FAILFAST); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && - v->vdev_failfast == B_TRUE) { + failfast) { bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index d7b50242992..27f3bbb46f4 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1689,6 +1689,24 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs) return (0); } +/* + * Return a referenced znode at or after zp. The z_znodes_lock protects the + * list walk; the returned inode reference keeps the znode alive after the + * lock is dropped for zfs_rezget(). + */ +static znode_t * +zfs_resume_hold_next_znode(zfsvfs_t *zfsvfs, znode_t *zp) +{ + ASSERT(MUTEX_HELD(&zfsvfs->z_znodes_lock)); + + for (; zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) { + if (igrab(ZTOI(zp)) != NULL) + return (zp); + } + + return (NULL); +} + /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the @@ -1732,13 +1750,23 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) * dbufs. If a zfs_rezget() fails, then we unhash the inode * and mark it stale. This prevents a collision if a new * inode/object is created which must use the same inode - * number. The stale inode will be be released when the - * VFS prunes the dentry holding the remaining references - * on the stale inode. + * number. The stale inode will be released when the VFS + * prunes the dentry holding the remaining references on + * the stale inode. + * + * zfs_rezget() takes the per-object znode hold lock. 
Pin each znode + * while holding z_znodes_lock, then drop the list lock before calling + * zfs_rezget() to preserve the normal zh_lock -> z_znodes_lock order. */ mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp; - zp = list_next(&zfsvfs->z_all_znodes, zp)) { + zp = zfs_resume_hold_next_znode(zfsvfs, + list_head(&zfsvfs->z_all_znodes)); + while (zp != NULL) { + znode_t *next = zfs_resume_hold_next_znode(zfsvfs, + list_next(&zfsvfs->z_all_znodes, zp)); + + mutex_exit(&zfsvfs->z_znodes_lock); + err2 = zfs_rezget(zp); if (err2) { zpl_d_drop_aliases(ZTOI(zp)); @@ -1747,9 +1775,14 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) /* see comment in zfs_suspend_fs() */ if (zp->z_suspended) { - zfs_zrele_async(zp); zp->z_suspended = B_FALSE; + zfs_zrele_async(zp); } + + zfs_zrele_async(zp); + + mutex_enter(&zfsvfs->z_znodes_lock); + zp = next; } mutex_exit(&zfsvfs->z_znodes_lock); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index e65f8123012..d6dad70ae09 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -2434,9 +2434,13 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { + /* + * attrzp is zp's hidden xattr directory, so the second + * znode lock acquisition is nested rather than recursive. 
+ */ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_enter(&attrzp->z_acl_lock); - mutex_enter(&attrzp->z_lock); + mutex_enter_nested(&attrzp->z_acl_lock, NESTED_SINGLE); + mutex_enter_nested(&attrzp->z_lock, NESTED_SINGLE); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); @@ -4074,18 +4078,32 @@ zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); + krwlock_t *zti_lock = &zfsvfs->z_teardown_inactive_lock; uint64_t atime[2]; int error; int need_unlock = 0; + boolean_t no_lockdep = B_FALSE; /* Only read lock if we haven't already write locked, e.g. rollback */ - if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { + if (!RW_WRITE_HELD(zti_lock)) { need_unlock = 1; - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + /* + * kswapd reaches evict_inode() with fs_reclaim held. Suppress + * lockdep only for this reclaim-thread acquire/release pair. + */ + no_lockdep = current_is_reclaim_thread(); + if (no_lockdep) + rw_enter_nolockdep(zti_lock, RW_READER); + else + rw_enter(zti_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { - if (need_unlock) - rw_exit(&zfsvfs->z_teardown_inactive_lock); + if (need_unlock) { + if (no_lockdep) + rw_exit_nolockdep(zti_lock); + else + rw_exit(zti_lock); + } return; } @@ -4111,8 +4129,12 @@ zfs_inactive(struct inode *ip) } zfs_zinactive(zp); - if (need_unlock) - rw_exit(&zfsvfs->z_teardown_inactive_lock); + if (need_unlock) { + if (no_lockdep) + rw_exit_nolockdep(zti_lock); + else + rw_exit(zti_lock); + } } /* diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 2cd0f17c860..d7194e4f1f7 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -550,10 +550,11 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg) * * Finally, all filesystems get automatic handling for the 'source' option, * that 
is, the "name" of the filesystem (the first column of df(1)'s output). - * However, this only happens if the handler does not otherwise handle - * the 'source' option. Since we handle _all_ options because of 'sloppy', we - * deal with this explicitly by calling into the kernel's helper for this, - * vfs_parse_fs_param_source(), which sets up fc->source. + * However, this only happens if the handler does not otherwise handle the + * 'source' option. Since we handle _all_ options because of 'sloppy', we have + * to handle it ourselves. Normally we would call vfs_parse_fs_param_source() + * to deal with this, but that didn't appear until 5.14, and it's small enough + * that we can just handle it ourselves. * * source * @@ -565,6 +566,7 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg) */ enum { + Opt_source, Opt_exec, Opt_suid, Opt_dev, Opt_atime, Opt_relatime, Opt_strictatime, Opt_saxattr, Opt_dirxattr, Opt_noxattr, @@ -574,6 +576,8 @@ enum { }; static const struct fs_parameter_spec zpl_param_spec[] = { + fsparam_string("source", Opt_source), + fsparam_flag_no("exec", Opt_exec), fsparam_flag_no("suid", Opt_suid), fsparam_flag_no("dev", Opt_dev), @@ -609,18 +613,34 @@ static const struct fs_parameter_spec zpl_param_spec[] = { {} }; +/* + * Before 5.6, fs_parse() took a struct fs_parameter_description + * which wraps the parameter specs with name and enum pointers. From 5.6, + * the description struct was removed and fs_parse() accepts the + * fs_parameter_spec directly.
+ */ +static int +zpl_fs_parse(struct fs_context *fc, struct fs_parameter *param, + struct fs_parse_result *result) +{ +#ifdef HAVE_FS_PARSE_TAKES_SPEC + return (fs_parse(fc, zpl_param_spec, param, result)); +#else + static const struct fs_parameter_description zpl_param_desc = { + .name = "zfs", + .specs = zpl_param_spec, + }; + return (fs_parse(fc, &zpl_param_desc, param, result)); +#endif +} + static int zpl_parse_param(struct fs_context *fc, struct fs_parameter *param) { vfs_t *vfs = fc->fs_private; - /* Handle 'source' explicitly so we don't trip on it as an unknown. */ - int opt = vfs_parse_fs_param_source(fc, param); - if (opt != -ENOPARAM) - return (opt); - struct fs_parse_result result; - opt = fs_parse(fc, zpl_param_spec, param, &result); + int opt = zpl_fs_parse(fc, param, &result); if (opt == -ENOPARAM) { /* * Convert unknowns to warnings, to work around the whole @@ -632,6 +652,16 @@ zpl_parse_param(struct fs_context *fc, struct fs_parameter *param) return (opt); switch (opt) { + case Opt_source: + if (fc->source != NULL) { + cmn_err(CE_NOTE, + "ZFS: multiple 'source' options not supported"); + return (-SET_ERROR(EINVAL)); + } + fc->source = param->string; + param->string = NULL; + break; + case Opt_exec: vfs->vfs_exec = !result.negated; vfs->vfs_do_exec = B_TRUE; @@ -794,7 +824,7 @@ zpl_parse_monolithic(struct fs_context *fc, void *data) /* Check if this is one of our options. */ struct fs_parse_result result; - int opt = fs_parse(fc, zpl_param_spec, &param, &result); + int opt = zpl_fs_parse(fc, &param, &result); if (opt >= 0) { /* * We already know this one of our options, so a @@ -874,9 +904,14 @@ zpl_get_tree(struct fs_context *fc) if (sb->s_root == NULL) { vfs_t *vfs = fc->fs_private; - /* Apply readonly flag as mount option */ - if (fc->sb_flags & SB_RDONLY) { - vfs->vfs_readonly = B_TRUE; + /* + * If SB_RDONLY was set/cleared from mount options, update + * them in the options struct so we set up the filesystem + * in the proper state.
+ */ + if (fc->sb_flags_mask & SB_RDONLY) { + vfs->vfs_readonly = + (fc->sb_flags & SB_RDONLY) ? B_TRUE : B_FALSE; vfs->vfs_do_readonly = B_TRUE; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index d93282db815..68050c870de 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c @@ -701,6 +701,24 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, * ZFS allows extended user attributes to be disabled administratively * by setting the 'xattr=off' property on the dataset. */ + +/* + * Concatenate prefix + name into a NUL-terminated stack buffer. + * Linux fs/xattr.c (import_xattr_name) caps the full xattr name at + * XATTR_NAME_MAX before any handler runs, so XATTR_NAME_MAX + 1 + * bytes always fit. + */ +static inline void +zpl_xattr_join_name(char *buf, size_t buflen, const char *prefix, + size_t prefix_len, const char *name, size_t name_len) +{ + ASSERT3U(prefix_len + name_len + 1, <=, buflen); + + memcpy(buf, prefix, prefix_len); + memcpy(buf + prefix_len, name, name_len); + buf[prefix_len + name_len] = '\0'; +} + static int __zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) @@ -726,9 +744,13 @@ __zpl_xattr_user_get(struct inode *ip, const char *name, * try again without the namespace prefix for compatibility with * other platforms. 
*/ - char *xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); + char xattr_name[XATTR_NAME_MAX + 1]; + + zpl_xattr_join_name(xattr_name, sizeof (xattr_name), + XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN, + name, strlen(name)); + error = zpl_xattr_get(ip, xattr_name, value, size); - kmem_strfree(xattr_name); if (error == -ENODATA) error = zpl_xattr_get(ip, name, value, size); @@ -758,8 +780,13 @@ __zpl_xattr_user_set(zidmap_t *user_ns, * XATTR_CREATE: fail if xattr already exists * XATTR_REPLACE: fail if xattr does not exist */ - char *prefixed_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); + char prefixed_name[XATTR_NAME_MAX + 1]; const char *clear_name, *set_name; + + zpl_xattr_join_name(prefixed_name, sizeof (prefixed_name), + XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN, + name, strlen(name)); + if (zfs_xattr_compat) { clear_name = prefixed_name; set_name = name; @@ -776,7 +803,7 @@ __zpl_xattr_user_set(zidmap_t *user_ns, * because it already exists. Stop here. */ if (error == -EEXIST) - goto out; + return (error); /* * If XATTR_REPLACE was specified and we succeeded to clear * an xattr, we don't need to replace anything when setting @@ -788,10 +815,7 @@ __zpl_xattr_user_set(zidmap_t *user_ns, /* * Set the new value with the configured name format. 
*/ - error = zpl_xattr_set(ip, set_name, value, size, flags); -out: - kmem_strfree(prefixed_name); - return (error); + return (zpl_xattr_set(ip, set_name, value, size, flags)); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set); @@ -824,17 +848,16 @@ static int __zpl_xattr_trusted_get(struct inode *ip, const char *name, void *value, size_t size) { - char *xattr_name; - int error; + char xattr_name[XATTR_NAME_MAX + 1]; if (!capable(CAP_SYS_ADMIN)) return (-EACCES); - /* xattr_resolve_name will do this for us if this is defined */ - xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); - error = zpl_xattr_get(ip, xattr_name, value, size); - kmem_strfree(xattr_name); - return (error); + zpl_xattr_join_name(xattr_name, sizeof (xattr_name), + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN, + name, strlen(name)); + + return (zpl_xattr_get(ip, xattr_name, value, size)); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); @@ -844,17 +867,16 @@ __zpl_xattr_trusted_set(zidmap_t *user_ns, const void *value, size_t size, int flags) { (void) user_ns; - char *xattr_name; - int error; + char xattr_name[XATTR_NAME_MAX + 1]; if (!capable(CAP_SYS_ADMIN)) return (-EACCES); - /* xattr_resolve_name will do this for us if this is defined */ - xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); - error = zpl_xattr_set(ip, xattr_name, value, size, flags); - kmem_strfree(xattr_name); - return (error); + zpl_xattr_join_name(xattr_name, sizeof (xattr_name), + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN, + name, strlen(name)); + + return (zpl_xattr_set(ip, xattr_name, value, size, flags)); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set); @@ -889,14 +911,13 @@ static int __zpl_xattr_security_get(struct inode *ip, const char *name, void *value, size_t size) { - char *xattr_name; - int error; - /* xattr_resolve_name will do this for us if this is defined */ - xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); - error = zpl_xattr_get(ip, xattr_name, value, size); - 
kmem_strfree(xattr_name); + char xattr_name[XATTR_NAME_MAX + 1]; - return (error); + zpl_xattr_join_name(xattr_name, sizeof (xattr_name), + XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN, + name, strlen(name)); + + return (zpl_xattr_get(ip, xattr_name, value, size)); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); @@ -906,14 +927,13 @@ __zpl_xattr_security_set(zidmap_t *user_ns, const void *value, size_t size, int flags) { (void) user_ns; - char *xattr_name; - int error; - /* xattr_resolve_name will do this for us if this is defined */ - xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); - error = zpl_xattr_set(ip, xattr_name, value, size, flags); - kmem_strfree(xattr_name); + char xattr_name[XATTR_NAME_MAX + 1]; - return (error); + zpl_xattr_join_name(xattr_name, sizeof (xattr_name), + XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN, + name, strlen(name)); + + return (zpl_xattr_set(ip, xattr_name, value, size, flags)); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_valstr.c b/sys/contrib/openzfs/module/zcommon/zfs_valstr.c index 0cb9f584acc..41a2313e575 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_valstr.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_valstr.c @@ -238,11 +238,11 @@ _VALSTR_BITFIELD_IMPL(zio_stage, { 'E', "EN", "ENCRYPT" }, { 'C', "CG", "CHECKSUM_GENERATE" }, { 'N', "NW", "NOP_WRITE" }, - { 'B', "BF", "BRT_FREE" }, { 'd', "dS", "DDT_READ_START" }, { 'd', "dD", "DDT_READ_DONE" }, { 'd', "dW", "DDT_WRITE" }, { 'd', "dF", "DDT_FREE" }, + { 'B', "BF", "BRT_FREE" }, { 'G', "GA", "GANG_ASSEMBLE" }, { 'G', "GI", "GANG_ISSUE" }, { 'D', "DT", "DVA_THROTTLE" }, diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c index ee86fe0c717..09f5c88d8fb 100644 --- a/sys/contrib/openzfs/module/zcommon/zpool_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c @@ -374,10 +374,16 @@ vdev_prop_init(void) { "on", 1}, { NULL } }; + 
static const zprop_index_t boolean_inherit_table[] = { + { "off", 0}, + { "on", 1}, + { "inherit", ZPROP_BOOLEAN_INHERIT}, + { NULL } + }; static const zprop_index_t boolean_na_table[] = { { "off", 0}, { "on", 1}, - { "-", 2}, /* ZPROP_BOOLEAN_NA */ + { "-", ZPROP_BOOLEAN_NA}, { NULL } }; @@ -388,6 +394,14 @@ vdev_prop_init(void) { NULL } }; + static const zprop_index_t vdev_alloc_bias_table[] = { + { "none", VDEV_BIAS_NONE }, + { "log", VDEV_BIAS_LOG }, + { "special", VDEV_BIAS_SPECIAL }, + { "dedup", VDEV_BIAS_DEDUP }, + { NULL } + }; + struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES); @@ -547,8 +561,8 @@ vdev_prop_init(void) /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, - PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "FAILFAST", boolean_table, - sfeatures); + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off | inherit", "FAILFAST", + boolean_inherit_table, sfeatures); zprop_register_index(VDEV_PROP_SLOW_IO_EVENTS, "slow_io_events", B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SLOW_IO_EVENTS", boolean_table, sfeatures); @@ -556,6 +570,13 @@ vdev_prop_init(void) VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV, "auto | on | off", "IO_SCHEDULER", vdevschedulertype_table, sfeatures); + zprop_register_index(VDEV_PROP_ALLOC_BIAS, "alloc_bias", + VDEV_BIAS_NONE, PROP_DEFAULT, ZFS_TYPE_VDEV, + "none | log | special | dedup", "ALLOC_BIAS", + vdev_alloc_bias_table, sfeatures); + zprop_register_index(VDEV_PROP_ROTATIONAL, "rotational", 0, + PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "ROTATIONAL", + boolean_table, sfeatures); /* hidden properties */ zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING, diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index c28cb9114dd..3e76884c557 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -398,14 +398,14 @@ uint_t zfs_arc_pc_percent = 0; /* * log2(fraction 
of ARC which must be free to allow growing). - * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, + * I.e. If there is less than arc_c >> zfs_arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ -uint_t arc_no_grow_shift = 5; +uint_t zfs_arc_no_grow_shift = 5; /* @@ -586,6 +586,7 @@ arc_stats_t arc_stats = { { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, + { "l2_ndev", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, @@ -4975,7 +4976,7 @@ arc_reap_cb_check(void *arg, zthr_t *zthr) */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); - } else if (free_memory < arc_c >> arc_no_grow_shift) { + } else if (free_memory < arc_c >> zfs_arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; @@ -5571,20 +5572,6 @@ arc_buf_access(arc_buf_t *buf) !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } -/* a generic arc_read_done_func_t which you can use */ -void -arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *arg) -{ - (void) zio, (void) zb, (void) bp; - - if (buf == NULL) - return; - - memcpy(arg, buf->b_data, arc_buf_size(buf)); - arc_buf_destroy(buf, arg); -} - /* a generic arc_read_done_func_t */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, @@ -7440,6 +7427,7 @@ arc_kstat_update(kstat_t *ksp, int rw) aggsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); + as->arcstat_l2_ndev.value.ui64 = l2arc_ndev; as->arcstat_l2_hits.value.ui64 = 
wmsum_value(&arc_sums.arcstat_l2_hits); as->arcstat_l2_misses.value.ui64 = @@ -7654,7 +7642,8 @@ arc_tuning_update(boolean_t verbose) /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; - arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); + zfs_arc_no_grow_shift = MIN(zfs_arc_no_grow_shift, + arc_shrink_shift - 1); } /* Valid range: 1 - N ms */ @@ -11683,6 +11672,7 @@ EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); +EXPORT_SYMBOL(arc_buf_destroy); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); @@ -11701,6 +11691,10 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift, + param_set_arc_no_grow_shift, param_get_uint, ZMOD_RW, + "log2(fraction of ARC which must be free to allow growing)"); + #ifdef _KERNEL ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c index 51ce8b9a084..7e699a9b425 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_log.c +++ b/sys/contrib/openzfs/module/zfs/ddt_log.c @@ -221,7 +221,7 @@ ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz; VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, - B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, + B_FALSE, dlu, &dlu->dlu_ndbp, &dlu->dlu_dbp, DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO)); dlu->dlu_tx = tx; @@ -338,7 +338,7 @@ ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu) */ dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE); - dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG); + 
dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, dlu); ddt->ddt_log_active->ddl_length += dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz; diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index 4919ead3cea..654afe2f844 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -1859,7 +1859,7 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) &cookie)) != NULL) { /* * os_userused_lock protects against concurrent calls to - * zap_increment_int(). It's needed because zap_increment_int() + * zap_increment(). It's needed because zap_increment() * is not thread-safe (i.e. not atomic). */ mutex_enter(&os->os_userused_lock); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index fa18a2056bb..74874bb65d3 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -2901,16 +2901,20 @@ receive_read_record(dmu_recv_cookie_t *drc) { struct drr_object *drro = &drc->drc_rrd->header.drr_u.drr_object; - uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro); + uint32_t size; void *buf = NULL; dmu_object_info_t doi; + size = DRR_OBJECT_PAYLOAD_SIZE(drro); + if (size > SPA_MAXBLOCKSIZE) + return (SET_ERROR(ERANGE)); + if (size != 0) - buf = kmem_zalloc(size, KM_SLEEP); + buf = vmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { - kmem_free(buf, size); + vmem_free(buf, size); return (err); } err = dmu_object_info(drc->drc_os, drro->drr_object, &doi); @@ -2934,7 +2938,11 @@ receive_read_record(dmu_recv_cookie_t *drc) case DRR_WRITE: { struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write; - int size = DRR_WRITE_PAYLOAD_SIZE(drrw); + uint64_t size = DRR_WRITE_PAYLOAD_SIZE(drrw); + + if (size > SPA_MAXBLOCKSIZE) + return (SET_ERROR(ERANGE)); + abd_t *abd = abd_alloc_linear(size, B_FALSE); err = 
receive_read_payload_and_next_header(drc, size, abd_to_buf(abd)); @@ -2951,12 +2959,18 @@ receive_read_record(dmu_recv_cookie_t *drc) { struct drr_write_embedded *drrwe = &drc->drc_rrd->header.drr_u.drr_write_embedded; - uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); + uint32_t size; + void *buf; + + size = P2ROUNDUP(drrwe->drr_psize, 8); + if (size > SPA_MAXBLOCKSIZE) + return (SET_ERROR(ERANGE)); + + buf = vmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { - kmem_free(buf, size); + vmem_free(buf, size); return (err); } @@ -2985,7 +2999,11 @@ receive_read_record(dmu_recv_cookie_t *drc) case DRR_SPILL: { struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill; - int size = DRR_SPILL_PAYLOAD_SIZE(drrs); + uint64_t size = DRR_SPILL_PAYLOAD_SIZE(drrs); + + if (size > SPA_MAXBLOCKSIZE) + return (SET_ERROR(ERANGE)); + abd_t *abd = abd_alloc_linear(size, B_FALSE); err = receive_read_payload_and_next_header(drc, size, abd_to_buf(abd)); @@ -3136,7 +3154,7 @@ receive_process_record(struct receive_writer_arg *rwa, abd_free(rrd->abd); rrd->abd = NULL; } else if (rrd->payload != NULL) { - kmem_free(rrd->payload, rrd->payload_size); + vmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } return (0); @@ -3150,7 +3168,7 @@ receive_process_record(struct receive_writer_arg *rwa, rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { - kmem_free(rrd->payload, rrd->payload_size); + vmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } @@ -3163,7 +3181,7 @@ receive_process_record(struct receive_writer_arg *rwa, { struct drr_object *drro = &rrd->header.drr_u.drr_object; err = receive_object(rwa, drro, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); + vmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; break; } @@ -3201,7 +3219,7 @@ receive_process_record(struct receive_writer_arg *rwa, struct drr_write_embedded 
*drrwe = &rrd->header.drr_u.drr_write_embedded; err = receive_write_embedded(rwa, drrwe, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); + vmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; break; } @@ -3270,7 +3288,7 @@ receive_writer_thread(void *arg) rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { - kmem_free(rrd->payload, rrd->payload_size); + vmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } /* diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c index 4c354722e4f..d931d9432f0 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c @@ -2241,6 +2241,37 @@ setup_send_progress(struct dmu_send_params *dspp) return (dssp); } +/* + * Payloads must be multiples of 8 bytes for historical compatibility, but + * XDR-encoded nvlists are sized in multiples of 4 bytes and may need padding. + * + * Here we do the simplest possible thing and copy the data to a separate + * buffer. Not ideal in terms of performance and memory use, but most BEGIN + * nvlists are small or absent, the allocation is momentary, and we'll need + * to do this at most once per dataset. + * + * It's OK if there is extra data after a packed nvlist on the receiving + * side because packed nvlists have an internal end-of-list marker. + * + * The new buffer is allocated with kmem_alloc() and can be freed with + * fnvlist_pack_free(), like the original. 
+ */ +static inline void +pad_packed_nvlist(char **buffer, size_t *size) +{ + size_t size_in = *size; + size_t extra_bytes = P2ROUNDUP(size_in, 8) - size_in; + if (extra_bytes != 0) { + size_t expanded_size = size_in + extra_bytes; + char *longbuf = kmem_alloc(expanded_size, KM_SLEEP); + memcpy(longbuf, *buffer, size_in); + memset(longbuf + size_in, 0, extra_bytes); + fnvlist_pack_free(*buffer, size_in); + *buffer = longbuf; + *size = expanded_size; + } +} + /* * Actually do the bulk of the work in a zfs send. * @@ -2474,7 +2505,7 @@ dmu_send_impl(struct dmu_send_params *dspp) dsl_pool_rele(dp, tag); - void *payload = NULL; + char *payload = NULL; size_t payload_len = 0; nvlist_t *nvl = fnvlist_alloc(); @@ -2548,7 +2579,9 @@ dmu_send_impl(struct dmu_send_params *dspp) } if (!nvlist_empty(nvl)) { - payload = fnvlist_pack(nvl, &payload_len); + VERIFY0(nvlist_pack(nvl, &payload, &payload_len, + NV_ENCODE_XDR, KM_SLEEP)); + pad_packed_nvlist(&payload, &payload_len); drr->drr_payloadlen = payload_len; } diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c index 4ffd75ceace..b0354203d42 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c +++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c @@ -490,7 +490,7 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, } else { dmu_buf_t *db; VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus, - DB_RF_MUST_SUCCEED, FTAG, &db)); + DB_RF_MUST_SUCCEED, tag, &db)); dmu_buf_will_fill(db, tx, B_FALSE); VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen, SPA_MINBLOCKSIZE), tx)); diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c index 2253b868b53..e88de3dbdfd 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dir.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c @@ -1534,9 +1534,28 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) } /* call from syncing context when we actually write/free 
space for this dd */ -void -dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) +static void dsl_dir_diduse_transfer_space_impl(dsl_dir_t *dd, int64_t used, + int64_t compressed, int64_t uncompressed, int64_t tonew, + dd_used_t oldtype, dd_used_t newtype, boolean_t nested, dmu_tx_t *tx); + +static void +dsl_dir_lock_enter(dsl_dir_t *dd, boolean_t nested) +{ + /* + * lockdep needs an explicit subclass when a child dd_lock + * nests an ancestor. + */ + if (nested) { + mutex_enter_nested(&dd->dd_lock, NESTED_SINGLE); + } else { + mutex_enter(&dd->dd_lock); + } +} + +static void +dsl_dir_diduse_space_impl(dsl_dir_t *dd, dd_used_t type, + int64_t used, int64_t compressed, int64_t uncompressed, + boolean_t nested, dmu_tx_t *tx) { int64_t accounted_delta; @@ -1554,7 +1573,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, */ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); if (needlock) - mutex_enter(&dd->dd_lock); + dsl_dir_lock_enter(dd, nested); dsl_dir_phys_t *ddp = dsl_dir_phys(dd); accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); @@ -1582,12 +1601,20 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { - dsl_dir_diduse_transfer_space(dd->dd_parent, + dsl_dir_diduse_transfer_space_impl(dd->dd_parent, accounted_delta, compressed, uncompressed, - used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + used, DD_USED_CHILD_RSRV, DD_USED_CHILD, nested, tx); } } +void +dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, + int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) +{ + dsl_dir_diduse_space_impl(dd, type, used, compressed, uncompressed, + B_FALSE, tx); +} + void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) @@ -1612,10 +1639,10 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, 
mutex_exit(&dd->dd_lock); } -void -dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, +static void +dsl_dir_diduse_transfer_space_impl(dsl_dir_t *dd, int64_t used, int64_t compressed, int64_t uncompressed, int64_t tonew, - dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) + dd_used_t oldtype, dd_used_t newtype, boolean_t nested, dmu_tx_t *tx) { int64_t accounted_delta; @@ -1625,7 +1652,7 @@ dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, dmu_buf_will_dirty(dd->dd_dbuf, tx); - mutex_enter(&dd->dd_lock); + dsl_dir_lock_enter(dd, nested); dsl_dir_phys_t *ddp = dsl_dir_phys(dd); accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); @@ -1656,12 +1683,21 @@ dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { - dsl_dir_diduse_transfer_space(dd->dd_parent, + dsl_dir_diduse_transfer_space_impl(dd->dd_parent, accounted_delta, compressed, uncompressed, - used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + used, DD_USED_CHILD_RSRV, DD_USED_CHILD, nested, tx); } } +void +dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, + int64_t compressed, int64_t uncompressed, int64_t tonew, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + dsl_dir_diduse_transfer_space_impl(dd, used, compressed, + uncompressed, tonew, oldtype, newtype, B_FALSE, tx); +} + typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; @@ -1828,8 +1864,8 @@ dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, - delta, 0, 0, tx); + dsl_dir_diduse_space_impl(dd->dd_parent, DD_USED_CHILD_RSRV, + delta, 0, 0, B_TRUE, tx); } mutex_exit(&dd->dd_lock); } @@ -2268,22 +2304,29 @@ dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); 
inode_timespec_t t; + + ASSERT(dsl_pool_sync_context(dp)); gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; - if (spa_feature_is_enabled(dp->dp_spa, - SPA_FEATURE_EXTENSIBLE_DATASET)) { - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t ddobj = dd->dd_object; - dsl_dir_zapify(dd, tx); - VERIFY0(zap_update(mos, ddobj, - DD_FIELD_SNAPSHOTS_CHANGED, - sizeof (uint64_t), - sizeof (inode_timespec_t) / sizeof (uint64_t), - &t, tx)); - } mutex_exit(&dd->dd_lock); + + if (!spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_EXTENSIBLE_DATASET)) { + return; + } + + objset_t *mos = dd->dd_pool->dp_meta_objset; + + /* + * dsl_dir_zapify() and zap_update() may dirty buffers and recurse + * into space accounting, so do not call them with dd_lock held. + */ + dsl_dir_zapify(dd, tx); + VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_SNAPSHOTS_CHANGED, + sizeof (uint64_t), + sizeof (inode_timespec_t) / sizeof (uint64_t), &t, tx)); } void diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 6f5dfac7b9d..03e13ca96cc 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -1280,6 +1280,7 @@ dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx) spa->spa_scan_pass_errorscrub_pause = gethrestime_sec(); scn->errorscrub_phys.dep_paused_flags = B_TRUE; dsl_errorscrub_sync_state(scn, tx); + zap_cursor_fini(&scn->errorscrub_cursor); spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); diff --git a/sys/contrib/openzfs/module/zfs/gzip.c b/sys/contrib/openzfs/module/zfs/gzip.c index d183e998456..2dee3e1da78 100644 --- a/sys/contrib/openzfs/module/zfs/gzip.c +++ b/sys/contrib/openzfs/module/zfs/gzip.c @@ -96,13 +96,17 @@ zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len, /* check if hardware accelerator can be used */ if (qat_dc_use_accel(d_len)) { if (qat_compress(QAT_DECOMPRESS, s_start, s_len, 
- d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS) - return (0); + d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS) { + if ((size_t)dstlen == d_len) + return (0); + } /* if hardware de-compress fail, do it again with software */ } if (uncompress_func(d_start, &dstlen, s_start, s_len) != Z_OK) return (-1); + if ((size_t)dstlen != d_len) + return (-1); return (0); } diff --git a/sys/contrib/openzfs/module/zfs/lz4_zfs.c b/sys/contrib/openzfs/module/zfs/lz4_zfs.c index 24ecf0763f9..7218a505f79 100644 --- a/sys/contrib/openzfs/module/zfs/lz4_zfs.c +++ b/sys/contrib/openzfs/module/zfs/lz4_zfs.c @@ -89,17 +89,24 @@ zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len, (void) n; const char *src = s_start; uint32_t bufsiz = BE_IN32(src); + int decoded; /* invalid compressed buffer size encoded at start */ if (bufsiz + sizeof (bufsiz) > s_len) return (1); /* - * Returns 0 on success (decompression function returned non-negative) - * and non-zero on failure (decompression function returned negative). + * LZ4_uncompress_unknownOutputSize returns the number of bytes decoded + * on success, or a negative value on failure. An OpenZFS block must + * expand to exactly d_len bytes */ - return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], - d_start, bufsiz, d_len) < 0); + decoded = LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], + d_start, bufsiz, d_len); + if (decoded < 0) + return (1); + if (d_len != (size_t)decoded) + return (1); + return (0); } ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress) diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 6ea3ecd74fc..2be1f281268 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -82,11 +82,11 @@ int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. 
- * The zfs_condense_pct determines how much more compact the in-core + * The zfs_metaslab_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ -uint_t zfs_condense_pct = 200; +uint_t zfs_metaslab_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of @@ -3826,8 +3826,8 @@ metaslab_group_preload(metaslab_group_t *mg) * increase as a result of writing out the free space range tree. * * 2. Condense if the on on-disk space map representation is at least - * zfs_condense_pct/100 times the size of the optimal representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). + * zfs_metaslab_condense_pct/100 times the size of the optimal representation + * (i.e. zfs_metaslab_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * * 3. Do not condense if the on-disk size of the space map does not actually * decrease. 
@@ -3863,7 +3863,8 @@ metaslab_should_condense(metaslab_t *msp) uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); - return (object_size >= (optimal_size * zfs_condense_pct / 100) && + return (object_size >= + (optimal_size * zfs_metaslab_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } @@ -6442,6 +6443,14 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_alloc_threshold, U64, ZMOD_RW, + "Minimum size which forces the dynamic allocator to change its " + "allocation strategy"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_free_pct, UINT, ZMOD_RW, + "The minimum free space, in percent, to continue allocations in a " + "first-fit fashion"); + ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); @@ -6454,6 +6463,18 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only consider this many of the best metaslabs in each vdev"); +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, sm_blksz_no_log, INT, ZMOD_RW, + "Block size for space map in pools with log space map disabled. " + "Power of 2 greater than 4096."); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, sm_blksz_with_log, INT, ZMOD_RW, + "Block size for space map in pools with log space map enabled. 
" + "Power of 2 greater than 4096."); + ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, param_set_active_allocator, param_get_charp, ZMOD_RW, "SPA active allocator"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, condense_pct, UINT, ZMOD_RW, + "Condense on-disk spacemap when it is more than this many percents " + "of in-memory counterpart"); diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c index bd565bb7101..c6b36474b9f 100644 --- a/sys/contrib/openzfs/module/zfs/sa.c +++ b/sys/contrib/openzfs/module/zfs/sa.c @@ -1605,8 +1605,8 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); - mutex_enter(&hdl->sa_lock); mutex_enter(&zp->z_lock); + mutex_enter(&hdl->sa_lock); err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid, sizeof (uint64_t)); @@ -1750,8 +1750,8 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) zp->z_is_sa = B_TRUE; out: - mutex_exit(&zp->z_lock); mutex_exit(&hdl->sa_lock); + mutex_exit(&zp->z_lock); kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END); kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END); if (dxattr_obj) diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index eafd4b17620..c6ae91b8d9e 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -8333,12 +8333,20 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, error)); /* - * log, dedup and special vdevs should not be replaced by spares. 
+ * Spares can't replace logs */ - if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || - oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { + if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * For special and dedup vdevs a spare must have matching rotational + * characteristics. A rotating spare replacing a non-rotating vdev + * would silently degrade pool performance, so we reject the mismatch. + */ + if (newvd->vdev_isspare && + oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE && + newvd->vdev_nonrot != oldvd->vdev_nonrot) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - } /* * A dRAID spare can only replace a child of its parent dRAID vdev. @@ -11011,6 +11019,10 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } + for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; + vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) + vdev_sync_dispatch(vd, txg); + spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); @@ -11035,9 +11047,6 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); - /* - * Update usable space statistics. 
- */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); @@ -11811,6 +11820,12 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); +ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds_cachefile, U64, ZMOD_RW, + "Allow importing pools with missing top-level vdevs in cache file"); + +ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds_scan, U64, ZMOD_RW, + "Allow importing pools with missing top-level vdevs during scan"); + ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 30639d7f4c7..821dfd6faff 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -460,6 +460,7 @@ vdev_prop_get_objid(vdev_t *vd, uint64_t *objid) } else if (vd->vdev_leaf_zap != 0) { *objid = vd->vdev_leaf_zap; } else { + *objid = 0; return (EINVAL); } @@ -474,8 +475,11 @@ vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) uint64_t objid; int err; - if (vdev_prop_get_objid(vd, &objid) != 0) - return (EINVAL); + if (vdev_prop_get_objid(vd, &objid) != 0) { + /* No ZAP: property was never set, return the default. */ + *value = vdev_prop_default_numeric(prop); + return (ENOENT); + } err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); @@ -963,6 +967,20 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; + /* + * Restore the last-known rotational status for leaf vdevs. 
vdev_open() + * will overwrite this with the hardware value when the device is + * accessible; the persisted value acts as a fallback for failed or + * missing devices so that spare selection can still match on device + * type even when the original disk is gone. + */ + if (vd->vdev_ops->vdev_op_leaf) { + uint64_t rotational = 0; + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL, + &rotational) == 0) + vd->vdev_nonrot = !rotational; + } + vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); @@ -1117,6 +1135,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) vd->vdev_autosit = vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); + if (ops == &vdev_root_ops) + vd->vdev_failfast = + vdev_prop_default_numeric(VDEV_PROP_FAILFAST); + else + vd->vdev_failfast = ZPROP_BOOLEAN_INHERIT; /* * Add ourselves to the parent's list of children. @@ -3912,10 +3935,9 @@ vdev_load(vdev_t *vd) vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 1, &failfast); if (error == 0) { - vd->vdev_failfast = failfast & 1; + vd->vdev_failfast = failfast; } else if (error == ENOENT) { - vd->vdev_failfast = vdev_prop_default_numeric( - VDEV_PROP_FAILFAST); + vd->vdev_failfast = ZPROP_BOOLEAN_INHERIT; } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " @@ -4224,17 +4246,39 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +static void +metaslab_sync_done_task(void *arg) +{ + metaslab_t *msp = arg; + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + metaslab_sync_done(msp, spa_syncing_txg(spa)); +} + +void +vdev_sync_dispatch(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vdev_is_concrete(vd)); + + for (metaslab_t *msp = txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)); + msp; msp = txg_list_next(&vd->vdev_ms_list, msp, TXG_CLEAN(txg))) { + (void) taskq_dispatch(spa->spa_sync_tq, + metaslab_sync_done_task, msp, TQ_SLEEP); + } 
+} + void vdev_sync_done(vdev_t *vd, uint64_t txg) { - metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); - while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) - != NULL) - metaslab_sync_done(msp, txg); + taskq_wait(vd->vdev_spa->spa_sync_tq); + + while (txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)) != NULL) + ; if (reassess) { metaslab_sync_reassess(vd->vdev_mg); @@ -6093,6 +6137,29 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) strval); } break; + case VDEV_PROP_ALLOC_BIAS: { + intval = fnvpair_value_uint64(elem); + ASSERT3U(intval, !=, VDEV_BIAS_LOG); + const char *bias_str = + (intval == VDEV_BIAS_SPECIAL) ? + VDEV_ALLOC_BIAS_SPECIAL : + (intval == VDEV_BIAS_DEDUP) ? + VDEV_ALLOC_BIAS_DEDUP : NULL; + if (bias_str == NULL) { + (void) zap_remove(mos, objid, + VDEV_TOP_ZAP_ALLOCATION_BIAS, tx); + } else { + VERIFY0(zap_update(mos, objid, + VDEV_TOP_ZAP_ALLOCATION_BIAS, + 1, strlen(bias_str) + 1, bias_str, tx)); + spa_activate_allocation_classes(spa, tx); + } + spa_history_log_internal(spa, "vdev set", tx, + "vdev_guid=%llu: alloc_bias=%s", + (u_longlong_t)vdev_guid, + bias_str != NULL ? 
bias_str : "none"); + break; + } default: /* normalize the property name */ propname = vdev_prop_to_name(prop); @@ -6207,11 +6274,14 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: - if (nvpair_value_uint64(elem, &intval) != 0) { + if (nvpair_value_uint64(elem, &intval) != 0 || + intval > ZPROP_BOOLEAN_INHERIT || + (intval == ZPROP_BOOLEAN_INHERIT && + vd->vdev_ops == &vdev_root_ops)) { error = EINVAL; break; } - vd->vdev_failfast = intval & 1; + vd->vdev_failfast = intval; break; case VDEV_PROP_SIT_OUT: /* Only expose this for a draid or raidz leaf */ @@ -6319,6 +6389,53 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_scheduler = intval; break; + case VDEV_PROP_ALLOC_BIAS: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (vd != vd->vdev_top || vd->vdev_top_zap == 0) { + error = ENOTSUP; + break; + } + /* Log vdevs are not supported: remove and re-add. */ + if (vd->vdev_islog) { + error = ENOTSUP; + break; + } + /* special/dedup needs allocation_classes feature */ + if (intval != VDEV_BIAS_NONE && + ((intval != VDEV_BIAS_SPECIAL && + intval != VDEV_BIAS_DEDUP) || + !spa_feature_is_enabled(spa, + SPA_FEATURE_ALLOCATION_CLASSES))) { + error = ENOTSUP; + break; + } + /* + * Disallow converting the last normal vdev to + * avoid pool suspension on failed allocations. 
+ */ + if (intval != VDEV_BIAS_NONE && + vd->vdev_alloc_bias == VDEV_BIAS_NONE) { + vdev_t *rvd = spa->spa_root_vdev; + int normal = 0; + for (uint64_t c = 0; + c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + if (vdev_is_concrete(cvd) && + cvd->vdev_alloc_bias == + VDEV_BIAS_NONE && + !cvd->vdev_noalloc) + normal++; + } + if (normal <= 1) { + error = ENOTSUP; + break; + } + } + vd->vdev_alloc_bias = (vdev_alloc_bias_t)intval; + break; default: /* Most processing is done in vdev_props_set_sync */ break; @@ -6350,7 +6467,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; - uint64_t objid; + uint64_t objid = 0; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; @@ -6369,9 +6486,15 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); - if (vdev_prop_get_objid(vd, &objid) != 0) - return (SET_ERROR(EINVAL)); - ASSERT(objid != 0); + /* + * A missing ZAP is normal for spare and L2ARC vdevs, which are + * not part of the main vdev tree and never get ZAPs allocated. + * Many properties are sourced directly from vdev_t fields and + * work fine without one; ZAP-backed properties will return their + * default values. objid is set to 0 when absent and the few + * cases that call zap_lookup directly guard against this below. 
+ */ + (void) vdev_prop_get_objid(vd, &objid); mutex_enter(&spa->spa_props_lock); @@ -6694,18 +6817,28 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; - strval = NULL; - err = zap_lookup(mos, objid, nvpair_name(elem), - sizeof (uint64_t), 1, &intval); + if (objid != 0) { + err = zap_lookup(mos, objid, + nvpair_name(elem), + sizeof (uint64_t), 1, &intval); + } else { + err = ENOENT; + } if (err == ENOENT) { - intval = vdev_prop_default_numeric( - prop); + if (vd->vdev_ops == &vdev_root_ops) + intval = + vdev_prop_default_numeric( + prop); + else + intval = ZPROP_BOOLEAN_INHERIT; err = 0; } else if (err) { break; } - if (intval == vdev_prop_default_numeric(prop)) + if (intval == ZPROP_BOOLEAN_INHERIT || + (vd->vdev_ops == &vdev_root_ops && + intval == 1)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, @@ -6746,6 +6879,17 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, boolval, src); break; + case VDEV_PROP_ALLOC_BIAS: + if (vd == vd->vdev_top) { + vdev_prop_add_list(outnvl, propname, + NULL, vd->vdev_alloc_bias, + ZPROP_SRC_NONE); + } + continue; + case VDEV_PROP_ROTATIONAL: + vdev_prop_add_list(outnvl, propname, NULL, + !vd->vdev_nonrot, ZPROP_SRC_NONE); + continue; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: @@ -6771,6 +6915,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properites */ + if (objid == 0) + continue; src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index b1371b0349c..e6da5c1707a 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -467,6 +467,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (!(flags & 
(VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && + vd->vdev_top != NULL) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, + vd->vdev_top->vdev_guid); + } if (vd->vdev_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); @@ -493,6 +498,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_wholedisk); } + if (vd->vdev_ops->vdev_op_leaf) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL, + !vd->vdev_nonrot); + } + if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); @@ -502,6 +512,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (flags & VDEV_CONFIG_L2CACHE) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + if ((flags & VDEV_CONFIG_SPARE) && vd->vdev_asize != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, @@ -1392,6 +1405,7 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv) VB_NVLIST); break; } + vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h index 1ec4d0218bb..3c3370290c8 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h @@ -102,14 +102,14 @@ #define WVR(X) [w##X] "=w" (w##X) -#define UVR0_(REG, ...) [w##REG] "+&w" (w##REG) -#define UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG) -#define UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG) -#define UVR3_(_1, _2, _3, REG, ...) 
[w##REG] "+&w" (w##REG) -#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG) -#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG) -#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG) -#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG) +#define UVR0_(REG, ...) [w##REG] "+w" (w##REG) +#define UVR1_(_1, REG, ...) [w##REG] "+w" (w##REG) +#define UVR2_(_1, _2, REG, ...) [w##REG] "+w" (w##REG) +#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+w" (w##REG) +#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+w" (w##REG) +#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+w" (w##REG) +#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+w" (w##REG) +#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+w" (w##REG) #define UVR0(r...) UVR0_(r) #define UVR1(r...) UVR1_(r) @@ -120,7 +120,7 @@ #define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31) #define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30) -#define UVR(X) [w##X] "+&w" (w##X) +#define UVR(X) [w##X] "+w" (w##X) #define R_01(REG1, REG2, ...) REG1, REG2 #define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c index b40d765e342..ca7598f489b 100644 --- a/sys/contrib/openzfs/module/zfs/zap.c +++ b/sys/contrib/openzfs/module/zfs/zap.c @@ -19,1074 +19,117 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2023 Alexander Stetsenko - * Copyright (c) 2023, Klara Inc. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2026, TrueNAS. */ -/* - * This file contains the top half of the zfs directory structure - * implementation. The bottom half is in zap_leaf.c. 
- * - * The zdir is an extendable hash data structure. There is a table of - * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are - * each a constant size and hold a variable number of directory entries. - * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. - * - * The pointer table holds a power of 2 number of pointers. - * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to - * by the pointer at index i in the table holds entries whose hash value - * has a zd_prefix_len - bit prefix - */ - -#include +#include #include #include -#include -#include -#include +#include #include #include #include -/* - * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object - * (all leaf blocks) when we start iterating over it. - * - * For zap_cursor_init(), the callers all intend to iterate through all the - * entries. There are a few cases where an error (typically i/o error) could - * cause it to bail out early. - * - * For zap_cursor_init_serialized(), there are callers that do the iteration - * outside of ZFS. Typically they would iterate over everything, but we - * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), - * zcp_snapshots_iter(), and other iterators over things in the MOS - these - * are called by /sbin/zfs and channel programs. The other example is - * zfs_readdir() which iterates over directory entries for the getdents() - * syscall. /sbin/ls iterates to the end (unless it receives a signal), but - * userland doesn't have to. - * - * Given that the ZAP entries aren't returned in a specific order, the only - * legitimate use cases for partial iteration would be: - * - * 1. Pagination: e.g. you only want to display 100 entries at a time, so you - * get the first 100 and then wait for the user to hit "next page", which - * they may never do). - * - * 2.
You want to know if there are more than X entries, without relying on - * the zfs-specific implementation of the directory's st_size (which is - * the number of entries). - */ -static int zap_iterate_prefetch = B_TRUE; - -/* - * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be - * collapsed into a single block. - */ -int zap_shrink_enabled = B_TRUE; - -int fzap_default_block_shift = 14; /* 16k blocksize */ - -static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); -static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); - -void -fzap_byteswap(void *vbuf, size_t size) -{ - uint64_t block_type = *(uint64_t *)vbuf; - - if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) - zap_leaf_byteswap(vbuf, size); - else { - /* it's a ptrtbl block */ - byteswap_uint64_array(vbuf, size); - } -} - -void -fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) -{ - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - zap->zap_ismicro = FALSE; - - zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; - zap->zap_dbu.dbu_evict_func_async = NULL; - - mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0); - zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; - - zap_phys_t *zp = zap_f_phys(zap); - /* - * explicitly zero it since it might be coming from an - * initialized microzap - */ - memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size); - zp->zap_block_type = ZBT_HEADER; - zp->zap_magic = ZAP_MAGIC; - - zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); - - zp->zap_freeblk = 2; /* block 1 will be the first leaf */ - zp->zap_num_leafs = 1; - zp->zap_num_entries = 0; - zp->zap_salt = zap->zap_salt; - zp->zap_normflags = zap->zap_normflags; - zp->zap_flags = flags; - - /* block 1 will be the first leaf */ - for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) - ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; - - /* - * set up block 1 - the first leaf - */ - dmu_buf_t *db; - VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, -
1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db, tx); - - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - l->l_dbuf = db; - - zap_leaf_init(l, zp->zap_normflags != 0); - - kmem_free(l, sizeof (zap_leaf_t)); - dmu_buf_rele(db, FTAG); -} - -static int -zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) -{ - if (RW_WRITE_HELD(&zap->zap_rwlock)) - return (1); - if (rw_tryupgrade(&zap->zap_rwlock)) { - dmu_buf_will_dirty(zap->zap_dbuf, tx); - return (1); - } - return (0); -} - -/* - * Generic routines for dealing with the pointer & cookie tables. - */ - -static int -zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, - void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), - dmu_tx_t *tx) -{ - uint64_t newblk; - int bs = FZAP_BLOCK_SHIFT(zap); - int hepb = 1<<(bs-4); - /* hepb = half the number of entries in a block */ - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - ASSERT(tbl->zt_numblks > 0); - - if (tbl->zt_nextblk != 0) { - newblk = tbl->zt_nextblk; - } else { - newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); - tbl->zt_nextblk = newblk; - ASSERT0(tbl->zt_blks_copied); - dmu_prefetch_by_dnode(zap->zap_dnode, 0, - tbl->zt_blk << bs, tbl->zt_numblks << bs, - ZIO_PRIORITY_SYNC_READ); - } - - /* - * Copy the ptrtbl from the old to new location.
- */ - - uint64_t b = tbl->zt_blks_copied; - dmu_buf_t *db_old; - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - - /* first half of entries in old[b] go to new[2*b+0] */ - dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db_new, tx); - transfer_func(db_old->db_data, db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db_new, tx); - transfer_func((uint64_t *)db_old->db_data + hepb, - db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - dmu_buf_rele(db_old, FTAG); - - tbl->zt_blks_copied++; - - dprintf("copied block %llu of %llu\n", - (u_longlong_t)tbl->zt_blks_copied, - (u_longlong_t)tbl->zt_numblks); - - if (tbl->zt_blks_copied == tbl->zt_numblks) { - (void) dmu_free_range(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); - - tbl->zt_blk = newblk; - tbl->zt_numblks *= 2; - tbl->zt_shift++; - tbl->zt_nextblk = 0; - tbl->zt_blks_copied = 0; - - dprintf("finished; numblocks now %llu (%uk entries)\n", - (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10)); - } - - return (0); -} - -static int -zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, - dmu_tx_t *tx) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - - dprintf("storing %llx at index %llx\n", (u_longlong_t)val, - (u_longlong_t)idx); - - uint64_t blk = idx >> (bs-3); - uint64_t off = idx & ((1<<(bs-3))-1); - - dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - dmu_buf_will_dirty(db, 
tx); - - if (tbl->zt_nextblk != 0) { - uint64_t idx2 = idx * 2; - uint64_t blk2 = idx2 >> (bs-3); - uint64_t off2 = idx2 & ((1<<(bs-3))-1); - dmu_buf_t *db2; - - err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, - DMU_READ_NO_PREFETCH); - if (err != 0) { - dmu_buf_rele(db, FTAG); - return (err); - } - dmu_buf_will_dirty(db2, tx); - ((uint64_t *)db2->db_data)[off2] = val; - ((uint64_t *)db2->db_data)[off2+1] = val; - dmu_buf_rele(db2, FTAG); - } - - ((uint64_t *)db->db_data)[off] = val; - dmu_buf_rele(db, FTAG); - - return (0); -} - -static int -zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - uint64_t blk = idx >> (bs-3); - uint64_t off = idx & ((1<<(bs-3))-1); - - dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - *valp = ((uint64_t *)db->db_data)[off]; - dmu_buf_rele(db, FTAG); - - if (tbl->zt_nextblk != 0) { - /* - * read the nextblk for the sake of i/o error checking, - * so that zap_table_load() will catch errors for - * zap_table_store. - */ - blk = (idx*2) >> (bs-3); - - err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (tbl->zt_nextblk + blk) << bs, FTAG, &db, - DMU_READ_NO_PREFETCH); - if (err == 0) - dmu_buf_rele(db, FTAG); - } - return (err); -} - -/* - * Routines for growing the ptrtbl. - */ - -static void -zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) -{ - for (int i = 0; i < n; i++) { - uint64_t lb = src[i]; - dst[2 * i + 0] = lb; - dst[2 * i + 1] = lb; - } -} - -static int -zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) -{ - /* - * The pointer table should never use more hash bits than we - * have (otherwise we'd be using useless zero bits to index it). - * If we are within 2 bits of running out, stop growing, since - * this is already an aberrant condition. 
- */ - if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) - return (SET_ERROR(ENOSPC)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - /* - * We are outgrowing the "embedded" ptrtbl (the one - * stored in the header block). Give it its own entire - * block, which will double the size of the ptrtbl. - */ - ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, - ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); - - uint64_t newblk = zap_allocate_blocks(zap, 1); - dmu_buf_t *db_new; - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, - DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - dmu_buf_will_dirty(db_new, tx); - zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - dmu_buf_rele(db_new, FTAG); - - zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; - zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; - zap_f_phys(zap)->zap_ptrtbl.zt_shift++; - - ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << - (FZAP_BLOCK_SHIFT(zap)-3)); - - return (0); - } else { - return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, - zap_ptrtbl_transfer, tx)); - } -} - -static void -zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) -{ - dmu_buf_will_dirty(zap->zap_dbuf, tx); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); - ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); - zap_f_phys(zap)->zap_num_entries += delta; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); -} +/* zap_create */ static uint64_t -zap_allocate_blocks(zap_t *zap, int nblocks) +zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - uint64_t newblk 
= zap_f_phys(zap)->zap_freeblk; - zap_f_phys(zap)->zap_freeblk += nblocks; - return (newblk); -} + uint64_t obj; -static void -zap_leaf_evict_sync(void *dbu) -{ - zap_leaf_t *l = dbu; + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - -static zap_leaf_t * -zap_create_leaf(zap_t *zap, dmu_tx_t *tx) -{ - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - uint64_t blkid = zap_allocate_blocks(zap, 1); - dmu_buf_t *db = NULL; - - VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, - blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, - DMU_READ_NO_PREFETCH)); - - /* - * Create the leaf structure and stash it on the dbuf. If zap was - * recent shrunk or truncated, the dbuf might have been sitting in the - * cache waiting to be evicted, and so still have the old leaf attached - * to it. If so, just reuse it. - */ - zap_leaf_t *l = dmu_buf_get_user(db); - if (l == NULL) { - l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - l->l_blkid = blkid; - l->l_dbuf = db; - rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, - &l->l_dbuf); - dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + if (allocated_dnode == NULL) { + dnode_t *dn; + obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + &dn, FTAG, tx); + mzap_create_impl(dn, normflags, flags, tx); + dnode_rele(dn, FTAG); } else { - ASSERT3U(l->l_blkid, ==, blkid); - ASSERT3P(l->l_dbuf, ==, db); + obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + allocated_dnode, tag, tx); + mzap_create_impl(*allocated_dnode, normflags, flags, tx); } - rw_enter(&l->l_rwlock, RW_WRITER); - dmu_buf_will_dirty(l->l_dbuf, tx); - - zap_leaf_init(l, zap->zap_normflags != 0); - - zap_f_phys(zap)->zap_num_leafs++; - - return (l); + return (obj); } -int -fzap_count(zap_t *zap, uint64_t *count) +uint64_t +zap_create(objset_t *os, 
dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - ASSERT(!zap->zap_ismicro); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ - *count = zap_f_phys(zap)->zap_num_entries; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); - return (0); + return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } -/* - * Routines for obtaining zap_leaf_t's - */ - -void -zap_put_leaf(zap_leaf_t *l) +uint64_t +zap_create_dnsize(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { - rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf, NULL); + return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, + dnodesize, tx)); } -static zap_leaf_t * -zap_open_leaf(uint64_t blkid, dmu_buf_t *db) +uint64_t +zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - ASSERT(blkid != 0); - - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = blkid; - l->l_bs = highbit64(db->db_size) - 1; - l->l_dbuf = db; - - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); - - rw_exit(&l->l_rwlock); - if (winner != NULL) { - /* someone else set it first */ - zap_leaf_evict_sync(&l->l_dbu); - l = winner; - } - - /* - * lhr_pad was previously used for the next leaf in the leaf - * chain. There should be no chained leafs (as we have removed - * support for them). 
- */ - ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); - - /* - * There should be more hash entries than there can be - * chunks to put in the hash table - */ - ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); - - /* The chunks should begin at the end of the hash table */ - ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *) - &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); - - /* The chunks should end at the end of the block */ - ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - - (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); - - return (l); + return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, + 0, tx)); } -static int -zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, - zap_leaf_t **lp) +uint64_t +zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { - dmu_buf_t *db; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - /* - * If system crashed just after dmu_free_long_range in zfs_rmnode, we - * would be left with an empty xattr dir in delete queue. blkid=0 - * would be passed in when doing zfs_purgedir. If that's the case we - * should just return immediately. The underlying objects should - * already be freed, so this should be perfectly fine. - */ - if (blkid == 0) - return (SET_ERROR(ENOENT)); - - int bs = FZAP_BLOCK_SHIFT(zap); - int err = dmu_buf_hold_by_dnode(zap->zap_dnode, - blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - - ASSERT3U(db->db_object, ==, zap->zap_object); - ASSERT3U(db->db_offset, ==, blkid << bs); - ASSERT3U(db->db_size, ==, 1 << bs); - ASSERT(blkid != 0); - - zap_leaf_t *l = dmu_buf_get_user(db); - - if (l == NULL) - l = zap_open_leaf(blkid, db); - - rw_enter(&l->l_rwlock, lt); - /* - * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, - * causing ASSERT below to fail. 
- */ - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - ASSERT3U(l->l_blkid, ==, blkid); - ASSERT3P(l->l_dbuf, ==, db); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - *lp = l; - return (0); + return (zap_create_impl(os, normflags, 0, ot, 0, 0, + bonustype, bonuslen, dnodesize, NULL, NULL, tx)); } -static int -zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) +uint64_t +zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - ASSERT3U(idx, <, - (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); - *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); - return (0); - } else { - return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, - idx, valp)); - } + return (zap_create_flags_dnsize(os, normflags, flags, ot, + leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); } -static int -zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) +uint64_t +zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { - ASSERT(tx != NULL); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { - ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; - return (0); - } else { - return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, - idx, blk, tx)); - } + return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, + tx)); } -static int -zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, - dmu_tx_t *tx) +/* zap_create_hold */ + +uint64_t 
+zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { - int bs = FZAP_BLOCK_SHIFT(zap); - int epb = bs >> 3; /* entries per block */ - int err = 0; - - ASSERT(tx != NULL); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - /* - * Check for i/o errors - */ - for (int i = 0; i < nptrs; i += epb) { - uint64_t blk; - err = zap_idx_to_blk(zap, idx + i, &blk); - if (err != 0) { - return (err); - } - } - - for (int i = 0; i < nptrs; i++) { - err = zap_set_idx_to_blk(zap, idx + i, blk, tx); - ASSERT0(err); /* we checked for i/o errors above */ - if (err != 0) - break; - } - - return (err); + return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + allocated_dnode, tag, tx)); } -#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) - -/* - * Each leaf has single range of entries (block pointers) in the ZAP ptrtbl. - * If two leaves are siblings, their ranges are adjecent and contain the same - * number of entries. In order to find out if a leaf has a sibling, we need to - * check the range corresponding to the sibling leaf. There is no need to check - * all entries in the range, we only need to check the frist and the last one. 
- */ -static uint64_t -check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) -{ - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); - uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; - uint64_t nptrs = (1 << pref_diff); - uint64_t first; - uint64_t last; - - ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); - - if (zap_idx_to_blk(zap, idx, &first) != 0) - return (0); - - if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) - return (0); - - if (first != last) - return (0); - return (first); -} - -static int -zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) -{ - uint64_t blk; - - ASSERT(zap->zap_dbuf == NULL || - zap_f_phys(zap) == zap->zap_dbuf->db_data); - - /* Reality check for corrupt zap objects (leaf or header). */ - if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && - zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || - zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { - return (SET_ERROR(EIO)); - } - - uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - int err = zap_idx_to_blk(zap, idx, &blk); - if (err != 0) - return (err); - err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - - ASSERT(err || - ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == - zap_leaf_phys(*lp)->l_hdr.lh_prefix); - return (err); -} - -static int -zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, - const void *tag, dmu_tx_t *tx, zap_leaf_t **lp) -{ - zap_t *zap = zn->zn_zap; - uint64_t hash = zn->zn_hash; - int err; - int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - - ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - zap_leaf_phys(l)->l_hdr.lh_prefix); - - if (zap_tryupgradedir(zap, tx) == 0 || - old_prefix_len == 
zap_f_phys(zap)->zap_ptrtbl.zt_shift) { - /* We failed to upgrade, or need to grow the pointer table */ - objset_t *os = zap->zap_objset; - uint64_t object = zap->zap_object; - - zap_put_leaf(l); - *lp = l = NULL; - zap_unlockdir(zap, tag); - err = zap_lockdir(os, object, tx, RW_WRITER, - FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return (err); - ASSERT(!zap->zap_ismicro); - - while (old_prefix_len == - zap_f_phys(zap)->zap_ptrtbl.zt_shift) { - err = zap_grow_ptrtbl(zap, tx); - if (err != 0) - return (err); - } - - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - - if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { - /* it split while our locks were down */ - *lp = l; - return (0); - } - } - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - zap_leaf_phys(l)->l_hdr.lh_prefix); - - int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - - (old_prefix_len + 1); - uint64_t sibling = - (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; - - /* check for i/o errors before doing zap_leaf_split */ - for (int i = 0; i < (1ULL << prefix_diff); i++) { - uint64_t blk; - err = zap_idx_to_blk(zap, sibling + i, &blk); - if (err != 0) - return (err); - ASSERT3U(blk, ==, l->l_blkid); - } - - zap_leaf_t *nl = zap_create_leaf(zap, tx); - zap_leaf_split(l, nl, zap->zap_normflags != 0); - - /* set sibling pointers */ - for (int i = 0; i < (1ULL << prefix_diff); i++) { - err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); - ASSERT0(err); /* we checked for i/o errors above */ - } - - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0); - - if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { - /* we want the sibling */ - zap_put_leaf(l); - *lp = nl; - } else { - zap_put_leaf(nl); - *lp = l; - } - - return (0); -} - -static void 
-zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, - const void *tag, dmu_tx_t *tx) -{ - zap_t *zap = zn->zn_zap; - int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && - zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); - - zap_put_leaf(l); - - if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { - /* - * We are in the middle of growing the pointer table, or - * this leaf will soon make us grow it. - */ - if (zap_tryupgradedir(zap, tx) == 0) { - objset_t *os = zap->zap_objset; - uint64_t zapobj = zap->zap_object; - - zap_unlockdir(zap, tag); - int err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return; - } - - /* could have finished growing while our locks were down */ - if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) - (void) zap_grow_ptrtbl(zap, tx); - } -} - -static int -fzap_checkname(zap_name_t *zn) -{ - uint32_t maxnamelen = zn->zn_normbuf_len; - uint64_t len = (uint64_t)zn->zn_key_orig_numints * zn->zn_key_intlen; - /* Only allow directory zap to have longname */ - if (len > maxnamelen || - (len > ZAP_MAXNAMELEN && - zn->zn_zap->zap_dnode->dn_type != DMU_OT_DIRECTORY_CONTENTS)) - return (SET_ERROR(ENAMETOOLONG)); - return (0); -} - -static int -fzap_checksize(uint64_t integer_size, uint64_t num_integers) -{ - /* Only integer sizes supported by C */ - switch (integer_size) { - case 1: - case 2: - case 4: - case 8: - break; - default: - return (SET_ERROR(EINVAL)); - } - - if (integer_size * num_integers > ZAP_MAXVALUELEN) - return (SET_ERROR(E2BIG)); - - return (0); -} - -static int -fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) -{ - int err = fzap_checkname(zn); - if (err != 0) - return (err); - return (fzap_checksize(integer_size, num_integers)); -} - -/* - * Routines for manipulating attributes. 
- */ -int -fzap_lookup(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, void *buf, - char *realname, int rn_len, boolean_t *ncp, - uint64_t *actual_num_integers) -{ - zap_leaf_t *l; - zap_entry_handle_t zeh; - - int err = fzap_checkname(zn); - if (err != 0) - return (err); - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - if ((err = fzap_checksize(integer_size, num_integers)) != 0) { - zap_put_leaf(l); - return (err); - } - - err = zap_entry_read(&zeh, integer_size, num_integers, buf); - if (err == 0 && actual_num_integers != NULL) - *actual_num_integers = zeh.zeh_num_integers; - (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); - if (ncp) { - *ncp = zap_entry_normalization_conflict(&zeh, - zn, NULL, zn->zn_zap); - } - } - - zap_put_leaf(l); - return (err); -} - -int -fzap_add_cd(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - zap_t *zap = zn->zn_zap; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(!zap->zap_ismicro); - ASSERT0(fzap_check(zn, integer_size, num_integers)); - - err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - err = SET_ERROR(EEXIST); - goto out; - } - if (err != ENOENT) - goto out; - - err = zap_entry_create(l, zn, cd, - integer_size, num_integers, val, &zeh); - - if (err == 0) { - zap_increment_num_entries(zap, 1, tx); - } else if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) - goto retry; - } - -out: - if (l != NULL) { - if (err == ENOSPC) - zap_put_leaf(l); - else - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); - } - return (err); -} - -int -fzap_add(zap_name_t *zn, - 
uint64_t integer_size, uint64_t num_integers, - const void *val, const void *tag, dmu_tx_t *tx) -{ - int err = fzap_check(zn, integer_size, num_integers); - if (err != 0) - return (err); - - return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_NEED_CD, tag, tx)); -} - -int -fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, - const void *tag, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - boolean_t create; - zap_entry_handle_t zeh; - zap_t *zap = zn->zn_zap; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_check(zn, integer_size, num_integers); - if (err != 0) - return (err); - - err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, zn, &zeh); - create = (err == ENOENT); - ASSERT(err == 0 || err == ENOENT); - - if (create) { - err = zap_entry_create(l, zn, ZAP_NEED_CD, - integer_size, num_integers, val, &zeh); - if (err == 0) - zap_increment_num_entries(zap, 1, tx); - } else { - err = zap_entry_update(&zeh, integer_size, num_integers, val); - } - - if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) - goto retry; - } - - if (l != NULL) { - if (err == ENOSPC) - zap_put_leaf(l); - else - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); - } - return (err); -} - -int -fzap_length(zap_name_t *zn, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err != 0) - goto out; - - if (integer_size != NULL) - *integer_size = zeh.zeh_integer_size; - if (num_integers != NULL) - *num_integers = zeh.zeh_num_integers; -out: - zap_put_leaf(l); - return (err); -} - -int -fzap_remove(zap_name_t *zn, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - - err 
= zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - zap_entry_remove(&zeh); - zap_increment_num_entries(zn->zn_zap, -1, tx); - - if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && - zap_shrink_enabled) - return (zap_shrink(zn, l, tx)); - } - zap_put_leaf(l); - return (err); -} - -void -fzap_prefetch(zap_name_t *zn) -{ - uint64_t blk; - zap_t *zap = zn->zn_zap; - - uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, - zap_f_phys(zap)->zap_ptrtbl.zt_shift); - if (zap_idx_to_blk(zap, idx, &blk) != 0) - return; - int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, - ZIO_PRIORITY_SYNC_READ); -} - -/* - * Helper functions for consumers. - */ +/* zap_create_link */ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, @@ -1109,169 +152,727 @@ zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, return (new_obj); } +/* zap_create_claim */ + int -zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, - char *name, uint64_t namelen) +zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - zap_cursor_t zc; - int err; + return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, + 0, tx)); +} - if (mask == 0) - mask = -1ULL; +int +zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, + 0, ot, bonustype, bonuslen, dnodesize, tx)); +} - zap_attribute_t *za = zap_attribute_long_alloc(); - for (zap_cursor_init(&zc, os, zapobj); - (err = zap_cursor_retrieve(&zc, za)) == 0; - zap_cursor_advance(&zc)) { - if ((za->za_first_integer & mask) == (value & mask)) { - if (strlcpy(name, za->za_name, namelen) >= namelen) - err = SET_ERROR(ENAMETOOLONG); - break; +int 
+zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, + bonuslen, 0, tx)); +} + +int +zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); + error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, + dnodesize, tx); + if (error != 0) + return (error); + + error = dnode_hold(os, obj, FTAG, &dn); + if (error != 0) + return (error); + + mzap_create_impl(dn, normflags, 0, tx); + + dnode_rele(dn, FTAG); + + return (0); +} + +/* zap_destroy */ + +int +zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) +{ + /* + * dmu_object_free will free the object number and free the + * data. Freeing the data will cause our pageout function to be + * called, which will destroy our data (zap_leaf_t's and zap_t). 
+ */ + + return (dmu_object_free(os, zapobj, tx)); +} + +/* zap_lookup */ + +int +zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + + int err = + zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + if (!zap->zap_ismicro) { + err = fzap_lookup(zn, integer_size, num_integers, buf, + realname, rn_len, ncp, NULL); + } else { + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + if (num_integers < 1) { + err = SET_ERROR(EOVERFLOW); + } else if (integer_size != 8) { + err = SET_ERROR(EINVAL); + } else { + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; + if (realname != NULL) + (void) strlcpy(realname, + MZE_PHYS(zap, mze)->mze_name, + rn_len); + if (ncp) { + *ncp = mzap_normalization_conflict(zap, + zn, mze, &idx); + } + } } } - zap_cursor_fini(&zc); - zap_attribute_free(za); + zap_name_free(zn); + zap_unlock(zap, FTAG); return (err); } int -zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) +zap_lookup(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) { - zap_cursor_t zc; - int err = 0; + return (zap_lookup_norm(os, zapobj, name, integer_size, + num_integers, buf, 0, NULL, 0, NULL)); +} - zap_attribute_t *za = zap_attribute_long_alloc(); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } - err = zap_add(os, intoobj, za->za_name, - 8, 1, &za->za_first_integer, tx); - if (err != 0) - break; +int +zap_lookup_by_dnode(dnode_t 
*dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm_by_dnode(dn, name, integer_size, + num_integers, buf, 0, NULL, 0, NULL)); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_lookup_norm_by_dnode(dn, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + dnode_rele(dn, FTAG); + return (err); +} + +/* zap_lookup_uint64 */ + +int +zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf, + uint64_t *actual_num_integers) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } - zap_cursor_fini(&zc); - zap_attribute_free(za); + + err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL, actual_num_integers); + zap_name_free(zn); + zap_unlock(zap, FTAG); return (err); } int -zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - uint64_t value, dmu_tx_t *tx) +zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { - zap_cursor_t zc; - int err = 0; - - zap_attribute_t *za = zap_attribute_long_alloc(); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } - err = zap_add(os, intoobj, za->za_name, - 8, 1, &value, tx); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - 
zap_attribute_free(za); + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_lookup_length_uint64_by_dnode(dn, key, key_numints, + integer_size, num_integers, buf, NULL); + dnode_rele(dn, FTAG); return (err); } int -zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, +zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_length_uint64_by_dnode(dn, key, key_numints, + integer_size, num_integers, buf, NULL)); +} + +/* zap_contains */ + +int +zap_contains_by_dnode(dnode_t *dn, const char *name) +{ + int err = zap_lookup_norm_by_dnode(dn, name, 0, + 0, NULL, 0, NULL, 0, NULL); + if (err == EOVERFLOW || err == EINVAL) + err = 0; /* found, but skipped reading the value */ + return (err); +} + +int +zap_contains(objset_t *os, uint64_t zapobj, const char *name) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_contains_by_dnode(dn, name); + dnode_rele(dn, FTAG); + return (err); +} + +/* zap_prefetch */ + +static int +zap_prefetch_by_dnode(dnode_t *dn, const char *name) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlock(zap, FTAG); + return (err); +} + +int +zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_prefetch_by_dnode(dn, name); + dnode_rele(dn, FTAG); + return (err); +} + +/* zap_prefetch_uint64 */ + +int +zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, NULL, RW_READER, 
TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlock(zap, FTAG); + return (0); +} + +int +zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_prefetch_uint64_by_dnode(dn, key, key_numints); + dnode_rele(dn, FTAG); + return (err); +} + +/* zap_prefetch_object */ + +int +zap_prefetch_object(objset_t *os, uint64_t zapobj) +{ + int error; + dmu_object_info_t doi; + + error = dmu_object_info(os, zapobj, &doi); + if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) + error = SET_ERROR(EINVAL); + if (error == 0) + dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset); + + return (error); +} + +/* zap_add */ + +int +zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + + const uint64_t *intval = val; + zap_name_t *zn = zap_name_alloc_str(zap, key, 0); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_add(zn, integer_size, num_integers, val, tx); + } else if (integer_size != 8 || num_integers != 1 || + strlen(key) >= MZAP_NAME_LEN || + !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { + err = mzap_upgrade(&zn->zn_zap, tx, 0); + if (err == 0) { + err = fzap_add(zn, integer_size, num_integers, val, tx); + } + } else { + zfs_btree_index_t idx; + if (mze_find(zn, &idx) != NULL) { + err = SET_ERROR(EEXIST); + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + zap_unlock(zap, FTAG); + return (err); +} + +int 
+zap_add(objset_t *os, uint64_t zapobj, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_add_by_dnode(dn, key, integer_size, num_integers, val, tx); + dnode_rele(dn, FTAG); + return (err); +} + +/* zap_add_uint64 */ + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlock(zap, FTAG); + return (err); +} + +int +zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_add_uint64_by_dnode(dn, key, key_numints, + integer_size, num_integers, val, tx); + dnode_rele(dn, FTAG); + return (err); +} + +/* zap_update */ + +int +zap_update_by_dnode(dnode_t *dn, const char *name, int integer_size, + uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + + const uint64_t *intval = val; + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_update(zn, integer_size, num_integers, val, tx); + } else if (integer_size != 
8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + (u_longlong_t)dn->dn_object, integer_size, + (u_longlong_t)num_integers, name); + err = mzap_upgrade(&zn->zn_zap, tx, 0); + if (err == 0) { + err = fzap_update(zn, integer_size, num_integers, + val, tx); + } + } else { + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); + if (mze != NULL) { + MZE_PHYS(zap, mze)->mze_value = *intval; + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + zap_unlock(zap, FTAG); + return (err); +} + +int +zap_update(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_update_by_dnode(dn, name, + integer_size, num_integers, val, tx); + dnode_rele(dn, FTAG); + return (err); +} + +/* zap_update_uint64 */ + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_update(zn, integer_size, num_integers, val, tx); + zap_name_free(zn); + zap_unlock(zap, FTAG); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { - zap_cursor_t zc; - int err = 0; + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_update_uint64_by_dnode(dn, key, key_numints, + integer_size, num_integers, val, tx); + dnode_rele(dn, FTAG); + return (err); 
+} - zap_attribute_t *za = zap_attribute_long_alloc(); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - uint64_t delta = 0; +/* zap_length */ - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } +int +zap_length_by_dnode(dnode_t *dn, const char *name, uint64_t *integer_size, + uint64_t *num_integers) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); - err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); - if (err != 0 && err != ENOENT) - break; - delta += za->za_first_integer; - err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); - if (err != 0) - break; + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } - zap_cursor_fini(&zc); - zap_attribute_free(za); + if (!zap->zap_ismicro) { + err = fzap_length(zn, integer_size, num_integers); + } else { + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + if (integer_size) + *integer_size = 8; + if (num_integers) + *num_integers = 1; + } + } + zap_name_free(zn); + zap_unlock(zap, FTAG); return (err); } int -zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +zap_length(objset_t *os, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers) { - char name[20]; + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_length_by_dnode(dn, name, integer_size, num_integers); + dnode_rele(dn, FTAG); + return (err); +} - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); - return (zap_add(os, obj, name, 8, 1, &value, tx)); +/* zap_length_uint64 */ + +int +zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t 
*num_integers) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlock(zap, FTAG); + return (err); } int -zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) { - char name[20]; + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_length_uint64_by_dnode(dn, key, key_numints, + integer_size, num_integers); + dnode_rele(dn, FTAG); + return (err); +} - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); - return (zap_remove(os, obj, name, tx)); +/* zap_remove */ + +static int +zap_remove_norm_by_dnode(dnode_t *dn, const char *name, matchtype_t mt, + dmu_tx_t *tx) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_remove(zn, tx); + } else { + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + zap->zap_m.zap_num_entries--; + memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); + zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); + } + } + zap_name_free(zn); + zap_unlock(zap, FTAG); + return (err); } int -zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) +zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); - return 
(zap_lookup(os, obj, name, 8, 1, &value)); + return (zap_remove_norm(os, zapobj, name, 0, tx)); } int -zap_add_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx) +zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) { - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_add(os, obj, name, 8, 1, &value, tx)); + return (zap_remove_norm_by_dnode(dn, name, 0, tx)); } int -zap_update_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx) +zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx) { - char name[20]; + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_remove_norm_by_dnode(dn, name, mt, tx); + dnode_rele(dn, FTAG); + return (err); +} - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_update(os, obj, name, 8, 1, &value, tx)); +/* zap_remove_uint64 */ + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlock(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlock(zap, FTAG); + return (err); } int -zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) +zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx) { - char name[20]; + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_remove_uint64_by_dnode(dn, key, key_numints, tx); + dnode_rele(dn, FTAG); + return (err); +} - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_lookup(os, obj, name, 8, 1, valuep)); +/* 
zap_count */ + +int +zap_count_by_dnode(dnode_t *dn, uint64_t *count) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + if (!zap->zap_ismicro) { + err = fzap_count(zap, count); + } else { + *count = zap->zap_m.zap_num_entries; + } + zap_unlock(zap, FTAG); + return (err); } int -zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, +zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_count_by_dnode(dn, count); + dnode_rele(dn, FTAG); + return (err); +} + +/* zap_increment */ + +int +zap_increment_by_dnode(dnode_t *dn, const char *name, int64_t delta, dmu_tx_t *tx) { uint64_t value = 0; @@ -1279,439 +880,428 @@ zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, if (delta == 0) return (0); - int err = zap_lookup(os, obj, name, 8, 1, &value); + int err = zap_lookup_by_dnode(dn, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); value += delta; if (value == 0) - err = zap_remove(os, obj, name, tx); + err = zap_remove_by_dnode(dn, name, tx); else - err = zap_update(os, obj, name, 8, 1, &value, tx); + err = zap_update_by_dnode(dn, name, 8, 1, &value, tx); return (err); } int -zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, +zap_increment(objset_t *os, uint64_t zapobj, const char *name, int64_t delta, dmu_tx_t *tx) { - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_increment(os, obj, name, delta, tx)); -} - -/* - * Routines for iterating over the attributes. 
- */ - -int -fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) -{ - int err; - zap_entry_handle_t zeh; - zap_leaf_t *l; - - /* retrieve the next entry at or after zc_hash/zc_cd */ - /* if no entry, return ENOENT */ - - /* - * If we are reading from the beginning, we're almost certain to - * iterate over the entire ZAP object. If there are multiple leaf - * blocks (freeblk > 2), prefetch the whole object (up to - * dmu_prefetch_max bytes), so that we read the leaf blocks - * concurrently. (Unless noprefetch was requested via - * zap_cursor_init_noprefetch()). - */ - if (zc->zc_hash == 0 && zap_iterate_prefetch && - zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, - zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), - ZIO_PRIORITY_ASYNC_READ); - } - - if (zc->zc_leaf) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - - /* - * The leaf was either shrunk or split. - */ - if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || - (ZAP_HASH_IDX(zc->zc_hash, - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - } - -again: - if (zc->zc_leaf == NULL) { - err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, - &zc->zc_leaf); - if (err != 0) - return (err); - } - l = zc->zc_leaf; - - err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); - - if (err == ENOENT) { - if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) { - zc->zc_hash = -1ULL; - zc->zc_cd = 0; - } else { - uint64_t nocare = (1ULL << - (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; - - zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; - zc->zc_cd = 0; - - if (zc->zc_hash == 0) { - zc->zc_hash = -1ULL; - } else { - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - goto again; - } - } - } - - if (err == 0) { - zc->zc_hash = zeh.zeh_hash; - zc->zc_cd = zeh.zeh_cd; - za->za_integer_length = zeh.zeh_integer_size; - 
za->za_num_integers = zeh.zeh_num_integers; - if (zeh.zeh_num_integers == 0) { - za->za_first_integer = 0; - } else { - err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); - ASSERT(err == 0 || err == EOVERFLOW); - } - err = zap_entry_read_name(zap, &zeh, - za->za_name_len, za->za_name); - ASSERT0(err); - - za->za_normalization_conflict = - zap_entry_normalization_conflict(&zeh, - NULL, za->za_name, zap); - } - rw_exit(&zc->zc_leaf->l_rwlock); + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_increment_by_dnode(dn, name, delta, tx); + dnode_rele(dn, FTAG); return (err); } -static void -zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +/* zap_value_search */ + +static int +zap_value_search_impl(zap_cursor_t *zc, uint64_t value, uint64_t mask, + char *name, uint64_t namelen) { - uint64_t lastblk = 0; + int err; - /* - * NB: if a leaf has more pointers than an entire ptrtbl block - * can hold, then it'll be accounted for more than once, since - * we won't have lastblk. 
- */ - for (int i = 0; i < len; i++) { - zap_leaf_t *l; + if (mask == 0) + mask = -1ULL; - if (tbl[i] == lastblk) - continue; - lastblk = tbl[i]; - - int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); - if (err == 0) { - zap_leaf_stats(zap, l, zs); - zap_put_leaf(l); + zap_attribute_t *za = zap_attribute_long_alloc(); + for (; (err = zap_cursor_retrieve(zc, za)) == 0; + zap_cursor_advance(zc)) { + if ((za->za_first_integer & mask) == (value & mask)) { + if (strlcpy(name, za->za_name, namelen) >= namelen) + err = SET_ERROR(ENAMETOOLONG); + break; } } + zap_cursor_fini(zc); + zap_attribute_free(za); + return (err); +} + +int +zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, + char *name, uint64_t namelen) +{ + zap_cursor_t zc; + zap_cursor_init(&zc, os, zapobj); + return (zap_value_search_impl(&zc, value, mask, name, namelen)); +} + +int +zap_value_search_by_dnode(dnode_t *dn, uint64_t value, uint64_t mask, + char *name, uint64_t namelen) +{ + zap_cursor_t zc; + zap_cursor_init_by_dnode(&zc, dn); + return (zap_value_search_impl(&zc, value, mask, name, namelen)); +} + +/* zap_*_int */ + +#define FORMAT_INT_KEY(name, value) \ + char name[20]; \ + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + +int +zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + FORMAT_INT_KEY(name, value); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} +int +zap_add_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx) +{ + FORMAT_INT_KEY(name, value); + return (zap_add_by_dnode(dn, name, 8, 1, &value, tx)); +} + +int +zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + FORMAT_INT_KEY(name, value); + return (zap_remove(os, obj, name, tx)); +} +int +zap_remove_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx) +{ + FORMAT_INT_KEY(name, value); + return (zap_remove_by_dnode(dn, name, tx)); +} + +int +zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) +{ + 
FORMAT_INT_KEY(name, value); + return (zap_lookup(os, obj, name, 8, 1, &value)); +} + +int +zap_lookup_int_by_dnode(dnode_t *dn, uint64_t value) +{ + FORMAT_INT_KEY(name, value); + return (zap_lookup_by_dnode(dn, name, 8, 1, &value)); +} + +/* zap_*_int_key */ + +int +zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + FORMAT_INT_KEY(name, key); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} +int +zap_add_int_key_by_dnode(dnode_t *dn, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + FORMAT_INT_KEY(name, key); + return (zap_add_by_dnode(dn, name, 8, 1, &value, tx)); +} + +int +zap_update_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + FORMAT_INT_KEY(name, key); + return (zap_update(os, obj, name, 8, 1, &value, tx)); +} +int +zap_update_int_key_by_dnode(dnode_t *dn, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + FORMAT_INT_KEY(name, key); + return (zap_update_by_dnode(dn, name, 8, 1, &value, tx)); +} + +int +zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) +{ + FORMAT_INT_KEY(name, key); + return (zap_lookup(os, obj, name, 8, 1, valuep)); +} +int +zap_lookup_int_key_by_dnode(dnode_t *dn, uint64_t key, uint64_t *valuep) +{ + FORMAT_INT_KEY(name, key); + return (zap_lookup_by_dnode(dn, name, 8, 1, valuep)); +} + +/* zap_cursor */ + +static int +zap_cursor_init_by_dnode_impl(zap_cursor_t *zc, dnode_t *dn, + uint64_t serialized, boolean_t prefetch) +{ + zc->zc_zap = NULL; + zc->zc_leaf = NULL; + + int err = zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + zc, &zc->zc_zap); + if (err != 0) + return (err); + + zc->zc_prefetch = prefetch; + zc->zc_objset = dn->dn_objset; + zc->zc_zapobj = dn->dn_object; + + int hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = serialized << (64 - hb); + zc->zc_cd = serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; + + /* + * Drop ZAP read lock, but keep the hold, 
so the holds on the + * underlying dnode and header dbuf are maintained. + */ + rw_exit(&zc->zc_zap->zap_rwlock); + + return (0); +} + +static int +zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized, uint32_t prefetch) +{ + dnode_t *dn = NULL; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) { + zc->zc_zap = NULL; + zc->zc_leaf = NULL; + return (err); + } + + err = zap_cursor_init_by_dnode_impl(zc, dn, serialized, prefetch); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + return (zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE)); +} + +int +zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn) +{ + return (zap_cursor_init_by_dnode_impl(zc, dn, 0, B_TRUE)); +} + +int +zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + return (zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE)); +} + +int +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + return (zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE)); +} + +int +zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn, + uint64_t serialized) +{ + return (zap_cursor_init_by_dnode_impl(zc, dn, serialized, B_TRUE)); } void -fzap_get_stats(zap_t *zap, zap_stats_t *zs) +zap_cursor_fini(zap_cursor_t *zc) { - int bs = FZAP_BLOCK_SHIFT(zap); - zs->zs_blocksize = 1ULL << bs; - - /* - * Set zap_phys_t fields - */ - zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; - zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; - zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; - zs->zs_block_type = zap_f_phys(zap)->zap_block_type; - zs->zs_magic = zap_f_phys(zap)->zap_magic; - zs->zs_salt = zap_f_phys(zap)->zap_salt; - - /* - * Set zap_ptrtbl fields - */ - zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; - zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; - 
zs->zs_ptrtbl_blks_copied = - zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; - zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; - zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; - zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - /* the ptrtbl is entirely in the header block. */ - zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); - } else { - dmu_prefetch_by_dnode(zap->zap_dnode, 0, - zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, - ZIO_PRIORITY_SYNC_READ); - - for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; - b++) { - dmu_buf_t *db; - int err; - - err = dmu_buf_hold_by_dnode(zap->zap_dnode, - (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, - FTAG, &db, DMU_READ_NO_PREFETCH); - if (err == 0) { - zap_stats_ptrtbl(zap, db->db_data, - 1<<(bs-3), zs); - dmu_buf_rele(db, FTAG); - } - } + if (zc->zc_leaf) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); } + if (zc->zc_zap) { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + zap_unlock(zc->zc_zap, zc); + } + memset(zc, 0, sizeof (zap_cursor_t)); } -/* - * Find last allocated block and update freeblk. 
- */ -static void -zap_trunc(zap_t *zap) +int +zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { - uint64_t nentries; - uint64_t lastblk; + int err; - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + if (zc->zc_zap == NULL) + /* zap_cursor_init failed, cursor is invalid */ + return (SET_ERROR(EIO)); - if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { - /* External ptrtbl */ - nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); - lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + - zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; + if (zc->zc_hash == -1ULL) + return (SET_ERROR(ENOENT)); + + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { - /* Embedded ptrtbl */ - nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - lastblk = 0; + zfs_btree_index_t idx; + mzap_ent_t mze_tofind; + + mze_tofind.mze_hash = zc->zc_hash >> 32; + mze_tofind.mze_cd = zc->zc_cd; + + mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, + &mze_tofind, &idx); + if (mze == NULL) { + mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, + &idx, &idx); + } + if (mze) { + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); + za->za_normalization_conflict = + mzap_normalization_conflict(zc->zc_zap, NULL, + mze, &idx); + za->za_integer_length = 8; + za->za_num_integers = 1; + za->za_first_integer = mzep->mze_value; + (void) strlcpy(za->za_name, mzep->mze_name, + za->za_name_len); + zc->zc_hash = (uint64_t)mze->mze_hash << 32; + zc->zc_cd = mze->mze_cd; + err = 0; + } else { + zc->zc_hash = -1ULL; + err = SET_ERROR(ENOENT); + } } - for (uint64_t idx = 0; idx < nentries; idx++) { - uint64_t blk; - if (zap_idx_to_blk(zap, idx, &blk) != 0) - return; - if (blk > lastblk) - lastblk = blk; - } - - ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); - - zap_f_phys(zap)->zap_freeblk = lastblk + 1; -} - -/* - * ZAP shrinking algorithm. 
- * - * We shrink ZAP recuresively removing empty leaves. We can remove an empty leaf - * only if it has a sibling. Sibling leaves have the same prefix length and - * their prefixes differ only by the least significant (sibling) bit. We require - * both siblings to be empty. This eliminates a need to rehash the non-empty - * remaining leaf. When we have removed one of two empty sibling, we set ptrtbl - * entries of the removed leaf to point out to the remaining leaf. Prefix length - * of the remaining leaf is decremented. As a result, it has a new prefix and it - * might have a new sibling. So, we repeat the process. - * - * Steps: - * 1. Check if a sibling leaf (sl) exists and it is empty. - * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. - * 3. Release the sibling (sl) to derefer it again with WRITER lock. - * 4. Upgrade zapdir lock to WRITER (once). - * 5. Derefer released leaves again. - * 6. If it is needed, recheck whether both leaves are still siblings and empty. - * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of - * the remaining leaf (slbit 0). - * 8. Free disk block of the removed leaf (dmu_free_range). - * 9. Decrement prefix_len of the remaining leaf. - * 10. Repeat the steps. - */ -static int -zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) -{ - zap_t *zap = zn->zn_zap; - int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - uint64_t hash = zn->zn_hash; - uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; - uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - boolean_t trunc = B_FALSE; - int err = 0; - - ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); - ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); - - boolean_t writer = B_FALSE; - - /* - * To avoid deadlock always deref leaves in the same order - - * sibling 0 first, then sibling 1. 
- */ - while (prefix_len) { - zap_leaf_t *sl; - int64_t prefix_diff = zt_shift - prefix_len; - uint64_t sl_prefix = prefix ^ 1; - uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); - int slbit = prefix & 1; - - ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); - - /* - * Check if there is a sibling by reading ptrtbl ptrs. - */ - if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) - break; - - /* - * sibling 1, unlock it - we haven't yet dereferenced sibling 0. - */ - if (slbit == 1) { - zap_put_leaf(l); - l = NULL; - } - - /* - * Dereference sibling leaf and check if it is empty. - */ - if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, - &sl)) != 0) - break; - - ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); - - /* - * Check if we have a sibling and it is empty. - */ - if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || - zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { - zap_put_leaf(sl); - break; - } - - zap_put_leaf(sl); - - /* - * If there two empty sibling, we have work to do, so - * we need to lock ZAP ptrtbl as WRITER. - */ - if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { - /* We failed to upgrade */ - if (l != NULL) { - zap_put_leaf(l); - l = NULL; - } - - /* - * Usually, the right way to upgrade from a READER lock - * to a WRITER lock is to call zap_unlockdir() and - * zap_lockdir(), but we do not have a tag. Instead, - * we do it in more sophisticated way. - */ - rw_exit(&zap->zap_rwlock); - rw_enter(&zap->zap_rwlock, RW_WRITER); - dmu_buf_will_dirty(zap->zap_dbuf, tx); - - zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - writer = B_TRUE; - } - - /* - * Here we have WRITER lock for ptrtbl. - * Now, we need a WRITER lock for both siblings leaves. - * Also, we have to recheck if the leaves are still siblings - * and still empty. - */ - if (l == NULL) { - /* sibling 0 */ - if ((err = zap_deref_leaf(zap, (slbit ? 
sl_hash : hash), - tx, RW_WRITER, &l)) != 0) - break; - - /* - * The leaf isn't empty anymore or - * it was shrunk/split while our locks were down. - */ - if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || - zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) - break; - } - - /* sibling 1 */ - if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx, - RW_WRITER, &sl)) != 0) - break; - - /* - * The leaf isn't empty anymore or - * it was shrunk/split while our locks were down. - */ - if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || - zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { - zap_put_leaf(sl); - break; - } - - /* If we have gotten here, we have a leaf to collapse */ - uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff; - uint64_t nptrs = (1ULL << prefix_diff); - uint64_t sl_blkid = sl->l_blkid; - - /* - * Set ptrtbl entries to point out to the slibling 0 blkid - */ - if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, - tx)) != 0) { - zap_put_leaf(sl); - break; - } - - /* - * Free sibling 1 disk block. - */ - int bs = FZAP_BLOCK_SHIFT(zap); - if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) - trunc = B_TRUE; - - (void) dmu_free_range(zap->zap_objset, zap->zap_object, - sl_blkid << bs, 1 << bs, tx); - zap_put_leaf(sl); - - zap_f_phys(zap)->zap_num_leafs--; - - /* - * Update prefix and prefix_len. 
- */ - zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; - zap_leaf_phys(l)->l_hdr.lh_prefix_len--; - - prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; - prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - } - - if (trunc) - zap_trunc(zap); - - if (l != NULL) - zap_put_leaf(l); - + rw_exit(&zc->zc_zap->zap_rwlock); return (err); } -ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, - "When iterating ZAP object, prefetch it"); +void +zap_cursor_advance(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return; + zc->zc_cd++; +} -ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, - "Enable ZAP shrinking"); +uint64_t +zap_cursor_serialize(zap_cursor_t *zc) +{ + if (zc->zc_zap == NULL || zc->zc_hash == -1ULL) + return (-1ULL); + + ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap))); + ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); + + /* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So usually use a small + * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits + * of the cursor. 
+ * + * [ collision differentiator | zap_hashbits()-bit hash value ] + */ + return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | + ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); +} + +/* zap_get_stats */ + +int +zap_get_stats_by_dnode(dnode_t *dn, zap_stats_t *zs) +{ + zap_t *zap; + int err = + zap_lock_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + + memset(zs, 0, sizeof (zap_stats_t)); + + if (zap->zap_ismicro) { + zs->zs_blocksize = zap->zap_dbuf->db_size; + zs->zs_num_entries = zap->zap_m.zap_num_entries; + zs->zs_num_blocks = 1; + } else { + fzap_get_stats(zap, zs); + } + zap_unlock(zap, FTAG); + return (0); +} + +int +zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) +{ + dnode_t *dn; + int err = dnode_hold(os, zapobj, FTAG, &dn); + if (err != 0) + return (err); + err = zap_get_stats_by_dnode(dn, zs); + dnode_rele(dn, FTAG); + return (err); +} + +EXPORT_SYMBOL(zap_create); +EXPORT_SYMBOL(zap_create_dnsize); +EXPORT_SYMBOL(zap_create_norm); +EXPORT_SYMBOL(zap_create_norm_dnsize); +EXPORT_SYMBOL(zap_create_flags); +EXPORT_SYMBOL(zap_create_flags_dnsize); +EXPORT_SYMBOL(zap_create_claim); +EXPORT_SYMBOL(zap_create_claim_norm); +EXPORT_SYMBOL(zap_create_claim_norm_dnsize); +EXPORT_SYMBOL(zap_create_hold); +EXPORT_SYMBOL(zap_destroy); +EXPORT_SYMBOL(zap_lookup); +EXPORT_SYMBOL(zap_lookup_by_dnode); +EXPORT_SYMBOL(zap_lookup_norm); +EXPORT_SYMBOL(zap_lookup_uint64); +EXPORT_SYMBOL(zap_lookup_length_uint64_by_dnode); +EXPORT_SYMBOL(zap_contains); +EXPORT_SYMBOL(zap_prefetch); +EXPORT_SYMBOL(zap_prefetch_uint64); +EXPORT_SYMBOL(zap_prefetch_object); +EXPORT_SYMBOL(zap_add); +EXPORT_SYMBOL(zap_add_by_dnode); +EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); +EXPORT_SYMBOL(zap_update); +EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); +EXPORT_SYMBOL(zap_length); +EXPORT_SYMBOL(zap_length_uint64); +EXPORT_SYMBOL(zap_length_uint64_by_dnode); 
+EXPORT_SYMBOL(zap_remove); +EXPORT_SYMBOL(zap_remove_by_dnode); +EXPORT_SYMBOL(zap_remove_norm); +EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); +EXPORT_SYMBOL(zap_count); +EXPORT_SYMBOL(zap_count_by_dnode); +EXPORT_SYMBOL(zap_value_search); +EXPORT_SYMBOL(zap_add_int); +EXPORT_SYMBOL(zap_remove_int); +EXPORT_SYMBOL(zap_lookup_int); +EXPORT_SYMBOL(zap_add_int_key); +EXPORT_SYMBOL(zap_lookup_int_key); +EXPORT_SYMBOL(zap_increment); +EXPORT_SYMBOL(zap_cursor_init); +EXPORT_SYMBOL(zap_cursor_fini); +EXPORT_SYMBOL(zap_cursor_retrieve); +EXPORT_SYMBOL(zap_cursor_advance); +EXPORT_SYMBOL(zap_cursor_serialize); +EXPORT_SYMBOL(zap_cursor_init_serialized); +EXPORT_SYMBOL(zap_get_stats); diff --git a/sys/contrib/openzfs/module/zfs/zap_fat.c b/sys/contrib/openzfs/module/zfs/zap_fat.c new file mode 100644 index 00000000000..7b48c6fd5a1 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/zap_fat.c @@ -0,0 +1,1458 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 Alexander Stetsenko + * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2026, TrueNAS. + */ + +/* + * This file contains the top half of the zfs directory structure + * implementation. The bottom half is in zap_leaf.c. + * + * The zdir is an extendable hash data structure. There is a table of + * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are + * each a constant size and hold a variable number of directory entries. + * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. + * + * The pointer table holds a power of 2 number of pointers. + * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to + * by the pointer at index i in the table holds entries whose hash value + * has a zd_prefix_len - bit prefix + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object + * (all leaf blocks) when we start iterating over it. + * + * For zap_cursor_init(), the callers all intend to iterate through all the + * entries. There are a few cases where an error (typically i/o error) could + * cause it to bail out early. + * + * For zap_cursor_init_serialized(), there are callers that do the iteration + * outside of ZFS. Typically they would iterate over everything, but we + * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), + * zcp_snapshots_iter(), and other iterators over things in the MOS - these + * are called by /sbin/zfs and channel programs. The other example is + * zfs_readdir() which iterates over directory entries for the getdents() + * syscall. /sbin/ls iterates to the end (unless it receives a signal), but + * userland doesn't have to. + * + * Given that the ZAP entries aren't returned in a specific order, the only + * legitimate use cases for partial iteration would be: + * + * 1. Pagination: e.g. 
you only want to display 100 entries at a time, so you + * get the first 100 and then wait for the user to hit "next page", which + * they may never do). + * + * 2. You want to know if there are more than X entries, without relying on + * the zfs-specific implementation of the directory's st_size (which is + * the number of entries). + */ +static int zap_iterate_prefetch = B_TRUE; + +/* + * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be + * collapsed into a single block. + */ +int zap_shrink_enabled = B_TRUE; + +int fzap_default_block_shift = 14; /* 16k blocksize */ + +static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); +static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); + +void +fzap_byteswap(void *vbuf, size_t size) +{ + uint64_t block_type = *(uint64_t *)vbuf; + + if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) + zap_leaf_byteswap(vbuf, size); + else { + /* it's a ptrtbl block */ + byteswap_uint64_array(vbuf, size); + } +} + +void +fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + zap->zap_ismicro = FALSE; + + zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; + zap->zap_dbu.dbu_evict_func_async = NULL; + + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0); + zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; + + zap_phys_t *zp = zap_f_phys(zap); + /* + * explicitly zero it since it might be coming from an + * initialized microzap + */ + memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size); + zp->zap_block_type = ZBT_HEADER; + zp->zap_magic = ZAP_MAGIC; + + zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); + + zp->zap_freeblk = 2; /* block 1 will be the first leaf */ + zp->zap_num_leafs = 1; + zp->zap_num_entries = 0; + zp->zap_salt = zap->zap_salt; + zp->zap_normflags = zap->zap_normflags; + zp->zap_flags = flags; + + /* block 1 will be the first leaf */ + for (int i = 0; i < 
(1<zap_ptrtbl.zt_shift); i++) + ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; + + /* + * set up block 1 - the first leaf + */ + dmu_buf_t *db; + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + 1<l_dbuf = db; + + zap_leaf_init(l, zp->zap_normflags != 0); + + kmem_free(l, sizeof (zap_leaf_t)); + dmu_buf_rele(db, FTAG); +} + +/* + * Generic routines for dealing with the pointer & cookie tables. + */ + +static int +zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, + void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), + dmu_tx_t *tx) +{ + uint64_t newblk; + int bs = FZAP_BLOCK_SHIFT(zap); + int hepb = 1<<(bs-4); + /* hepb = half the number of entries in a block */ + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + ASSERT(tbl->zt_numblks > 0); + + if (tbl->zt_nextblk != 0) { + newblk = tbl->zt_nextblk; + } else { + newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); + tbl->zt_nextblk = newblk; + ASSERT0(tbl->zt_blks_copied); + dmu_prefetch_by_dnode(zap->zap_dnode, 0, + tbl->zt_blk << bs, tbl->zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); + } + + /* + * Copy the ptrtbl from the old to new location. 
+ */ + + uint64_t b = tbl->zt_blks_copied; + dmu_buf_t *db_old; + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + + /* first half of entries in old[b] go to new[2*b+0] */ + dmu_buf_t *db_new; + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db_new, tx); + transfer_func(db_old->db_data, db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + /* second half of entries in old[b] go to new[2*b+1] */ + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db_new, tx); + transfer_func((uint64_t *)db_old->db_data + hepb, + db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + dmu_buf_rele(db_old, FTAG); + + tbl->zt_blks_copied++; + + dprintf("copied block %llu of %llu\n", + (u_longlong_t)tbl->zt_blks_copied, + (u_longlong_t)tbl->zt_numblks); + + if (tbl->zt_blks_copied == tbl->zt_numblks) { + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); + + tbl->zt_blk = newblk; + tbl->zt_numblks *= 2; + tbl->zt_shift++; + tbl->zt_nextblk = 0; + tbl->zt_blks_copied = 0; + + dprintf("finished; numblocks now %llu (%uk entries)\n", + (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10)); + } + + return (0); +} + +static int +zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + + dprintf("storing %llx at index %llx\n", (u_longlong_t)val, + (u_longlong_t)idx); + + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); + + dmu_buf_t *db; + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + dmu_buf_will_dirty(db, 
tx); + + if (tbl->zt_nextblk != 0) { + uint64_t idx2 = idx * 2; + uint64_t blk2 = idx2 >> (bs-3); + uint64_t off2 = idx2 & ((1<<(bs-3))-1); + dmu_buf_t *db2; + + err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, + DMU_READ_NO_PREFETCH); + if (err != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + dmu_buf_will_dirty(db2, tx); + ((uint64_t *)db2->db_data)[off2] = val; + ((uint64_t *)db2->db_data)[off2+1] = val; + dmu_buf_rele(db2, FTAG); + } + + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db, FTAG); + + return (0); +} + +static int +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); + + dmu_buf_t *db; + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + *valp = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db, FTAG); + + if (tbl->zt_nextblk != 0) { + /* + * read the nextblk for the sake of i/o error checking, + * so that zap_table_load() will catch errors for + * zap_table_store. + */ + blk = (idx*2) >> (bs-3); + + err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (tbl->zt_nextblk + blk) << bs, FTAG, &db, + DMU_READ_NO_PREFETCH); + if (err == 0) + dmu_buf_rele(db, FTAG); + } + return (err); +} + +/* + * Routines for growing the ptrtbl. + */ + +static void +zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) +{ + for (int i = 0; i < n; i++) { + uint64_t lb = src[i]; + dst[2 * i + 0] = lb; + dst[2 * i + 1] = lb; + } +} + +static int +zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) +{ + /* + * The pointer table should never use more hash bits than we + * have (otherwise we'd be using useless zero bits to index it). + * If we are within 2 bits of running out, stop growing, since + * this is already an aberrant condition. 
+ */ + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) + return (SET_ERROR(ENOSPC)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + /* + * We are outgrowing the "embedded" ptrtbl (the one + * stored in the header block). Give it its own entire + * block, which will double the size of the ptrtbl. + */ + ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); + + uint64_t newblk = zap_allocate_blocks(zap, 1); + dmu_buf_t *db_new; + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, + DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + dmu_buf_will_dirty(db_new, tx); + zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + dmu_buf_rele(db_new, FTAG); + + zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; + zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; + zap_f_phys(zap)->zap_ptrtbl.zt_shift++; + + ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << + (FZAP_BLOCK_SHIFT(zap)-3)); + + return (0); + } else { + return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, + zap_ptrtbl_transfer, tx)); + } +} + +static void +zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); + ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); + zap_f_phys(zap)->zap_num_entries += delta; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); +} + +static uint64_t +zap_allocate_blocks(zap_t *zap, int nblocks) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + uint64_t newblk = zap_f_phys(zap)->zap_freeblk; + zap_f_phys(zap)->zap_freeblk += nblocks; + return (newblk); +} + +static void +zap_leaf_evict_sync(void *dbu) +{ + zap_leaf_t *l = dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + +static zap_leaf_t * 
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + uint64_t blkid = zap_allocate_blocks(zap, 1); + dmu_buf_t *db = NULL; + + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, + DMU_READ_NO_PREFETCH)); + + /* + * Create the leaf structure and stash it on the dbuf. If zap was + * recent shrunk or truncated, the dbuf might have been sitting in the + * cache waiting to be evicted, and so still have the old leaf attached + * to it. If so, just reuse it. + */ + zap_leaf_t *l = dmu_buf_get_user(db); + if (l == NULL) { + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_blkid = blkid; + l->l_dbuf = db; + rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, + &l->l_dbuf); + dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + } else { + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + } + + rw_enter(&l->l_rwlock, RW_WRITER); + dmu_buf_will_dirty(l->l_dbuf, tx); + + zap_leaf_init(l, zap->zap_normflags != 0); + + zap_f_phys(zap)->zap_num_leafs++; + + return (l); +} + +int +fzap_count(zap_t *zap, uint64_t *count) +{ + ASSERT(!zap->zap_ismicro); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ + *count = zap_f_phys(zap)->zap_num_entries; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); + return (0); +} + +/* + * Routines for obtaining zap_leaf_t's + */ + +void +zap_put_leaf(zap_leaf_t *l) +{ + rw_exit(&l->l_rwlock); + dmu_buf_rele(l->l_dbuf, NULL); +} + +static zap_leaf_t * +zap_open_leaf(uint64_t blkid, dmu_buf_t *db) +{ + ASSERT(blkid != 0); + + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = blkid; + l->l_bs = highbit64(db->db_size) - 1; + l->l_dbuf = db; + + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); + zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); + + 
rw_exit(&l->l_rwlock); + if (winner != NULL) { + /* someone else set it first */ + zap_leaf_evict_sync(&l->l_dbu); + l = winner; + } + + /* + * lhr_pad was previously used for the next leaf in the leaf + * chain. There should be no chained leafs (as we have removed + * support for them). + */ + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); + + /* + * There should be more hash entries than there can be + * chunks to put in the hash table + */ + ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); + + /* The chunks should begin at the end of the hash table */ + ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *) + &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); + + /* The chunks should end at the end of the block */ + ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - + (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); + + return (l); +} + +static int +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) +{ + dmu_buf_t *db; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + /* + * If system crashed just after dmu_free_long_range in zfs_rmnode, we + * would be left with an empty xattr dir in delete queue. blkid=0 + * would be passed in when doing zfs_purgedir. If that's the case we + * should just return immediately. The underlying objects should + * already be freed, so this should be perfectly fine. 
+ */ + if (blkid == 0) + return (SET_ERROR(ENOENT)); + + int bs = FZAP_BLOCK_SHIFT(zap); + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, + blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + + ASSERT3U(db->db_object, ==, zap->zap_object); + ASSERT3U(db->db_offset, ==, blkid << bs); + ASSERT3U(db->db_size, ==, 1 << bs); + ASSERT(blkid != 0); + + zap_leaf_t *l = dmu_buf_get_user(db); + + if (l == NULL) + l = zap_open_leaf(blkid, db); + + rw_enter(&l->l_rwlock, lt); + /* + * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, + * causing ASSERT below to fail. + */ + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + *lp = l; + return (0); +} + +static int +zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + ASSERT3U(idx, <, + (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); + return (0); + } else { + return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, + idx, valp)); + } +} + +static int +zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) +{ + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { + ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; + return (0); + } else { + return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, + idx, blk, tx)); + } +} + +static int +zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + int epb = bs >> 3; /* entries per block */ + int err = 0; + + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + /* + * Check for i/o errors + */ + for (int i = 0; i < nptrs; i += 
epb) { + uint64_t blk; + err = zap_idx_to_blk(zap, idx + i, &blk); + if (err != 0) { + return (err); + } + } + + for (int i = 0; i < nptrs; i++) { + err = zap_set_idx_to_blk(zap, idx + i, blk, tx); + ASSERT0(err); /* we checked for i/o errors above */ + if (err != 0) + break; + } + + return (err); +} + +#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) +#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) + +/* + * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl. + * If two leaves are siblings, their ranges are adjacent and contain the same + * number of entries. In order to find out if a leaf has a sibling, we need to + * check the range corresponding to the sibling leaf. There is no need to check + * all entries in the range, we only need to check the first and the last one. + */ +static uint64_t +check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; + uint64_t nptrs = (1 << pref_diff); + uint64_t first; + uint64_t last; + + ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + + if (zap_idx_to_blk(zap, idx, &first) != 0) + return (0); + + if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) + return (0); + + if (first != last) + return (0); + return (first); +} + +static int +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) +{ + uint64_t blk; + + ASSERT(zap->zap_dbuf == NULL || + zap_f_phys(zap) == zap->zap_dbuf->db_data); + + /* Reality check for corrupt zap objects (leaf or header). 
*/ + if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && + zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || + zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { + return (SET_ERROR(EIO)); + } + + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + int err = zap_idx_to_blk(zap, idx, &blk); + if (err != 0) + return (err); + err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); + + ASSERT(err || + ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == + zap_leaf_phys(*lp)->l_hdr.lh_prefix); + return (err); +} + +static int +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) +{ + zap_t *zap = zn->zn_zap; + uint64_t hash = zn->zn_hash; + int err; + int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + + ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + zap_leaf_phys(l)->l_hdr.lh_prefix); + + if (zap_lock_try_upgrade(zap, tx) == 0 || + old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { + /* We failed to upgrade, or need to grow the pointer table */ + zap_put_leaf(l); + *lp = l = NULL; + + zap_lock_upgrade(zap, tx); + + while (old_prefix_len == + zap_f_phys(zap)->zap_ptrtbl.zt_shift) { + err = zap_grow_ptrtbl(zap, tx); + if (err != 0) + return (err); + } + + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { + /* it split while our locks were down */ + *lp = l; + return (0); + } + } + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + zap_leaf_phys(l)->l_hdr.lh_prefix); + + int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + (old_prefix_len + 1); + uint64_t sibling = + (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; + + /* check for i/o errors before doing zap_leaf_split */ + 
for (int i = 0; i < (1ULL << prefix_diff); i++) { + uint64_t blk; + err = zap_idx_to_blk(zap, sibling + i, &blk); + if (err != 0) + return (err); + ASSERT3U(blk, ==, l->l_blkid); + } + + zap_leaf_t *nl = zap_create_leaf(zap, tx); + zap_leaf_split(l, nl, zap->zap_normflags != 0); + + /* set sibling pointers */ + for (int i = 0; i < (1ULL << prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); + ASSERT0(err); /* we checked for i/o errors above */ + } + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0); + + if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { + /* we want the sibling */ + zap_put_leaf(l); + *lp = nl; + } else { + zap_put_leaf(nl); + *lp = l; + } + + return (0); +} + +static void +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && + zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); + + zap_put_leaf(l); + + if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { + /* + * We are in the middle of growing the pointer table, or + * this leaf will soon make us grow it. 
+ */ + zap_lock_upgrade(zap, tx); + + /* could have finished growing while our locks were down */ + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) + (void) zap_grow_ptrtbl(zap, tx); + } +} + +static int +fzap_checkname(zap_name_t *zn) +{ + uint32_t maxnamelen = zn->zn_normbuf_len; + uint64_t len = (uint64_t)zn->zn_key_orig_numints * zn->zn_key_intlen; + /* Only allow directory zap to have longname */ + if (len > maxnamelen || + (len > ZAP_MAXNAMELEN && + zn->zn_zap->zap_dnode->dn_type != DMU_OT_DIRECTORY_CONTENTS)) + return (SET_ERROR(ENAMETOOLONG)); + return (0); +} + +static int +fzap_checksize(uint64_t integer_size, uint64_t num_integers) +{ + /* Only integer sizes supported by C */ + switch (integer_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return (SET_ERROR(EINVAL)); + } + + if (integer_size * num_integers > ZAP_MAXVALUELEN) + return (SET_ERROR(E2BIG)); + + return (0); +} + +static int +fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) +{ + int err = fzap_checkname(zn); + if (err != 0) + return (err); + return (fzap_checksize(integer_size, num_integers)); +} + +/* + * Routines for manipulating attributes. 
+ */ +int +fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *ncp, + uint64_t *actual_num_integers) +{ + zap_leaf_t *l; + zap_entry_handle_t zeh; + + int err = fzap_checkname(zn); + if (err != 0) + return (err); + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + if ((err = fzap_checksize(integer_size, num_integers)) != 0) { + zap_put_leaf(l); + return (err); + } + + err = zap_entry_read(&zeh, integer_size, num_integers, buf); + if (err == 0 && actual_num_integers != NULL) + *actual_num_integers = zeh.zeh_num_integers; + (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); + if (ncp) { + *ncp = zap_entry_normalization_conflict(&zeh, + zn, NULL, zn->zn_zap); + } + } + + zap_put_leaf(l); + return (err); +} + +int +fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(!zap->zap_ismicro); + ASSERT0(fzap_check(zn, integer_size, num_integers)); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + err = SET_ERROR(EEXIST); + goto out; + } + if (err != ENOENT) + goto out; + + err = zap_entry_create(l, zn, cd, + integer_size, num_integers, val, &zeh); + + if (err == 0) { + zap_increment_num_entries(zap, 1, tx); + } else if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tx, &l); + if (err == 0) + goto retry; + } + +out: + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); + } + return (err); +} + +int +fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + int err 
= fzap_check(zn, integer_size, num_integers); + if (err != 0) + return (err); + + return (fzap_add_cd(zn, integer_size, num_integers, + val, ZAP_NEED_CD, tx)); +} + +int +fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + boolean_t create; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + err = fzap_check(zn, integer_size, num_integers); + if (err != 0) + return (err); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + create = (err == ENOENT); + ASSERT(err == 0 || err == ENOENT); + + if (create) { + err = zap_entry_create(l, zn, ZAP_NEED_CD, + integer_size, num_integers, val, &zeh); + if (err == 0) + zap_increment_num_entries(zap, 1, tx); + } else { + err = zap_entry_update(&zeh, integer_size, num_integers, val); + } + + if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tx, &l); + if (err == 0) + goto retry; + } + + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); + } + return (err); +} + +int +fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + goto out; + + if (integer_size != NULL) + *integer_size = zeh.zeh_integer_size; + if (num_integers != NULL) + *num_integers = zeh.zeh_num_integers; +out: + zap_put_leaf(l); + return (err); +} + +int +fzap_remove(zap_name_t *zn, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + zap_entry_remove(&zeh); + 
zap_increment_num_entries(zn->zn_zap, -1, tx); + + if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && + zap_shrink_enabled) + return (zap_shrink(zn, l, tx)); + } + zap_put_leaf(l); + return (err); +} + +void +fzap_prefetch(zap_name_t *zn) +{ + uint64_t blk; + zap_t *zap = zn->zn_zap; + + uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, + zap_f_phys(zap)->zap_ptrtbl.zt_shift); + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + int bs = FZAP_BLOCK_SHIFT(zap); + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, + ZIO_PRIORITY_SYNC_READ); +} + +/* + * Routines for iterating over the attributes. + */ + +int +fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) +{ + int err; + zap_entry_handle_t zeh; + zap_leaf_t *l; + + /* retrieve the next entry at or after zc_hash/zc_cd */ + /* if no entry, return ENOENT */ + + /* + * If we are reading from the beginning, we're almost certain to + * iterate over the entire ZAP object. If there are multiple leaf + * blocks (freeblk > 2), prefetch the whole object (up to + * dmu_prefetch_max bytes), so that we read the leaf blocks + * concurrently. (Unless noprefetch was requested via + * zap_cursor_init_noprefetch()). + */ + if (zc->zc_hash == 0 && zap_iterate_prefetch && + zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, + zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), + ZIO_PRIORITY_ASYNC_READ); + } + + if (zc->zc_leaf) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + + /* + * The leaf was either shrunk or split. 
+ */ + if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || + (ZAP_HASH_IDX(zc->zc_hash, + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + } + +again: + if (zc->zc_leaf == NULL) { + err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, + &zc->zc_leaf); + if (err != 0) + return (err); + } + l = zc->zc_leaf; + + err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); + + if (err == ENOENT) { + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) { + zc->zc_hash = -1ULL; + zc->zc_cd = 0; + } else { + uint64_t nocare = (1ULL << + (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; + + zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; + zc->zc_cd = 0; + + if (zc->zc_hash == 0) { + zc->zc_hash = -1ULL; + } else { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + goto again; + } + } + } + + if (err == 0) { + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + za->za_integer_length = zeh.zeh_integer_size; + za->za_num_integers = zeh.zeh_num_integers; + if (zeh.zeh_num_integers == 0) { + za->za_first_integer = 0; + } else { + err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); + ASSERT(err == 0 || err == EOVERFLOW); + } + err = zap_entry_read_name(zap, &zeh, + za->za_name_len, za->za_name); + ASSERT0(err); + + za->za_normalization_conflict = + zap_entry_normalization_conflict(&zeh, + NULL, za->za_name, zap); + } + rw_exit(&zc->zc_leaf->l_rwlock); + return (err); +} + +static void +zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +{ + uint64_t lastblk = 0; + + /* + * NB: if a leaf has more pointers than an entire ptrtbl block + * can hold, then it'll be accounted for more than once, since + * we won't have lastblk. 
+ */ + for (int i = 0; i < len; i++) { + zap_leaf_t *l; + + if (tbl[i] == lastblk) + continue; + lastblk = tbl[i]; + + int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + if (err == 0) { + zap_leaf_stats(zap, l, zs); + zap_put_leaf(l); + } + } +} + +void +fzap_get_stats(zap_t *zap, zap_stats_t *zs) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + zs->zs_blocksize = 1ULL << bs; + + /* + * Set zap_phys_t fields + */ + zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; + zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; + zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; + zs->zs_block_type = zap_f_phys(zap)->zap_block_type; + zs->zs_magic = zap_f_phys(zap)->zap_magic; + zs->zs_salt = zap_f_phys(zap)->zap_salt; + + /* + * Set zap_ptrtbl fields + */ + zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; + zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; + zs->zs_ptrtbl_blks_copied = + zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; + zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; + zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + /* the ptrtbl is entirely in the header block. */ + zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); + } else { + dmu_prefetch_by_dnode(zap->zap_dnode, 0, + zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); + + for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + b++) { + dmu_buf_t *db; + int err; + + err = dmu_buf_hold_by_dnode(zap->zap_dnode, + (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (err == 0) { + zap_stats_ptrtbl(zap, db->db_data, + 1<<(bs-3), zs); + dmu_buf_rele(db, FTAG); + } + } + } +} + +/* + * Find last allocated block and update freeblk. 
+ */ +static void +zap_trunc(zap_t *zap) +{ + uint64_t nentries; + uint64_t lastblk; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { + /* External ptrtbl */ + nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); + lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + + zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; + } else { + /* Embedded ptrtbl */ + nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + lastblk = 0; + } + + for (uint64_t idx = 0; idx < nentries; idx++) { + uint64_t blk; + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + if (blk > lastblk) + lastblk = blk; + } + + ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); + + zap_f_phys(zap)->zap_freeblk = lastblk + 1; +} + +/* + * ZAP shrinking algorithm. + * + * We shrink ZAP recursively removing empty leaves. We can remove an empty leaf + * only if it has a sibling. Sibling leaves have the same prefix length and + * their prefixes differ only by the least significant (sibling) bit. We require + * both siblings to be empty. This eliminates a need to rehash the non-empty + * remaining leaf. When we have removed one of two empty siblings, we set ptrtbl + * entries of the removed leaf to point to the remaining leaf. Prefix length + * of the remaining leaf is decremented. As a result, it has a new prefix and it + * might have a new sibling. So, we repeat the process. + * + * Steps: + * 1. Check if a sibling leaf (sl) exists and it is empty. + * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. + * 3. Release the sibling (sl) to dereference it again with WRITER lock. + * 4. Upgrade zapdir lock to WRITER (once). + * 5. Dereference released leaves again. + * 6. If it is needed, recheck whether both leaves are still siblings and empty. + * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point to blkid of + * the remaining leaf (slbit 0). + * 8. Free disk block of the removed leaf (dmu_free_range). + * 9. 
Decrement prefix_len of the remaining leaf. + * 10. Repeat the steps. + */ +static int +zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + uint64_t hash = zn->zn_hash; + uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + boolean_t trunc = B_FALSE; + int err = 0; + + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); + ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); + + boolean_t writer = B_FALSE; + + /* + * To avoid deadlock always deref leaves in the same order - + * sibling 0 first, then sibling 1. + */ + while (prefix_len) { + zap_leaf_t *sl; + int64_t prefix_diff = zt_shift - prefix_len; + uint64_t sl_prefix = prefix ^ 1; + uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); + int slbit = prefix & 1; + + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); + + /* + * Check if there is a sibling by reading ptrtbl ptrs. + */ + if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) + break; + + /* + * sibling 1, unlock it - we haven't yet dereferenced sibling 0. + */ + if (slbit == 1) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Dereference sibling leaf and check if it is empty. + */ + if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, + &sl)) != 0) + break; + + ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); + + /* + * Check if we have a sibling and it is empty. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || + zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { + zap_put_leaf(sl); + break; + } + + zap_put_leaf(sl); + + /* + * If there two empty sibling, we have work to do, so + * we need to lock ZAP ptrtbl as WRITER. 
+ */ + if (!writer && (writer = zap_lock_try_upgrade(zap, tx)) == 0) { + /* We failed to upgrade */ + if (l != NULL) { + zap_put_leaf(l); + l = NULL; + } + + zap_lock_upgrade(zap, tx); + + zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + writer = B_TRUE; + } + + /* + * Here we have WRITER lock for ptrtbl. + * Now, we need a WRITER lock for both siblings leaves. + * Also, we have to recheck if the leaves are still siblings + * and still empty. + */ + if (l == NULL) { + /* sibling 0 */ + if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash), + tx, RW_WRITER, &l)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) + break; + } + + /* sibling 1 */ + if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx, + RW_WRITER, &sl)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { + zap_put_leaf(sl); + break; + } + + /* If we have gotten here, we have a leaf to collapse */ + uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff; + uint64_t nptrs = (1ULL << prefix_diff); + uint64_t sl_blkid = sl->l_blkid; + + /* + * Set ptrtbl entries to point out to the slibling 0 blkid + */ + if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, + tx)) != 0) { + zap_put_leaf(sl); + break; + } + + /* + * Free sibling 1 disk block. + */ + int bs = FZAP_BLOCK_SHIFT(zap); + if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) + trunc = B_TRUE; + + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + sl_blkid << bs, 1 << bs, tx); + zap_put_leaf(sl); + + zap_f_phys(zap)->zap_num_leafs--; + + /* + * Update prefix and prefix_len. 
+ */ + zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len--; + + prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + } + + if (trunc) + zap_trunc(zap); + + if (l != NULL) + zap_put_leaf(l); + + return (err); +} + +ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, + "When iterating ZAP object, prefetch it"); + +ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, + "Enable ZAP shrinking"); diff --git a/sys/contrib/openzfs/module/zfs/zap_impl.c b/sys/contrib/openzfs/module/zfs/zap_impl.c new file mode 100644 index 00000000000..0c2ba1cdbfe --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/zap_impl.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2026, TrueNAS. 
+ */ + +#include +#include +#include +#include +#include +#include + +static kmem_cache_t *zap_name_cache; +static kmem_cache_t *zap_attr_cache; +static kmem_cache_t *zap_name_long_cache; +static kmem_cache_t *zap_attr_long_cache; + +/* Setup/teardown caches. Part of the public interface in zap.h. */ +void +zap_init(void) +{ + zap_name_cache = kmem_cache_create("zap_name", + sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL, + NULL, NULL, NULL, 0); + + zap_attr_cache = kmem_cache_create("zap_attr_cache", + sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL, + NULL, NULL, NULL, NULL, 0); + + zap_name_long_cache = kmem_cache_create("zap_name_long", + sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL, + NULL, NULL, NULL, 0); + + zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache", + sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, + NULL, NULL, NULL, NULL, 0); +} + +void +zap_fini(void) +{ + kmem_cache_destroy(zap_name_cache); + kmem_cache_destroy(zap_attr_cache); + kmem_cache_destroy(zap_name_long_cache); + kmem_cache_destroy(zap_attr_long_cache); +} + +static int +zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags, + size_t outlen) +{ + ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); + + size_t inlen = strlen(name) + 1; + + int err = 0; + (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, + normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, + U8_UNICODE_LATEST, &err); + + return (err); +} + +zap_name_t * +zap_name_alloc(zap_t *zap, boolean_t longname) +{ + kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache; + zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP); + + zn->zn_zap = zap; + zn->zn_normbuf_len = longname ? 
ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; + return (zn); +} + +zap_name_t * +zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) +{ + size_t key_len = strlen(key) + 1; + zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN)); + if (zap_name_init_str(zn, key, mt) != 0) { + zap_name_free(zn); + return (NULL); + } + return (zn); +} + +zap_name_t * +zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) +{ + zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP); + + ASSERT0(zap->zap_normflags); + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = zn->zn_key_norm = key; + zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; + zn->zn_matchtype = 0; + zn->zn_normbuf_len = ZAP_MAXNAMELEN; + + zn->zn_hash = zap_hash(zn); + return (zn); +} + +void +zap_name_free(zap_name_t *zn) +{ + if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) { + kmem_cache_free(zap_name_cache, zn); + } else { + ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW); + kmem_cache_free(zap_name_long_cache, zn); + } +} + +int +zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) +{ + zap_t *zap = zn->zn_zap; + size_t key_len = strlen(key) + 1; + + /* Make sure zn is allocated for longname if key is long */ + IMPLY(key_len > ZAP_MAXNAMELEN, + zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW); + + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = key; + zn->zn_key_orig_numints = key_len; + zn->zn_matchtype = mt; + zn->zn_normflags = zap->zap_normflags; + + /* + * If we're dealing with a case sensitive lookup on a mixed or + * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup + * will fold case to all caps overriding the lookup request. + */ + if (mt & MT_MATCH_CASE) + zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; + + if (zap->zap_normflags) { + /* + * We *must* use zap_normflags because this normalization is + * what the hash is computed from. 
+ */ + if (zap_normalize(zap, key, zn->zn_normbuf, + zap->zap_normflags, zn->zn_normbuf_len) != 0) + return (SET_ERROR(ENOTSUP)); + zn->zn_key_norm = zn->zn_normbuf; + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; + } else { + if (mt != 0) + return (SET_ERROR(ENOTSUP)); + zn->zn_key_norm = zn->zn_key_orig; + zn->zn_key_norm_numints = zn->zn_key_orig_numints; + } + + zn->zn_hash = zap_hash(zn); + + if (zap->zap_normflags != zn->zn_normflags) { + /* + * We *must* use zn_normflags because this normalization is + * what the matching is based on. (Not the hash!) + */ + if (zap_normalize(zap, key, zn->zn_normbuf, + zn->zn_normflags, zn->zn_normbuf_len) != 0) + return (SET_ERROR(ENOTSUP)); + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; + } + + return (0); +} + +boolean_t +zap_match(zap_name_t *zn, const char *matchname) +{ + boolean_t res = B_FALSE; + ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); + + if (zn->zn_matchtype & MT_NORMALIZE) { + size_t namelen = zn->zn_normbuf_len; + char normbuf[ZAP_MAXNAMELEN]; + char *norm = normbuf; + + /* + * Cannot allocate this on-stack as it exceed the stack-limit of + * 1024. 
+ */ + if (namelen > ZAP_MAXNAMELEN) + norm = kmem_alloc(namelen, KM_SLEEP); + + if (zap_normalize(zn->zn_zap, matchname, norm, + zn->zn_normflags, namelen) != 0) { + res = B_FALSE; + } else { + res = (strcmp(zn->zn_key_norm, norm) == 0); + } + if (norm != normbuf) + kmem_free(norm, namelen); + } else { + res = (strcmp(zn->zn_key_orig, matchname) == 0); + } + return (res); +} + +uint64_t +zap_hash(zap_name_t *zn) +{ + zap_t *zap = zn->zn_zap; + uint64_t h = 0; + + if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { + ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); + h = *(uint64_t *)zn->zn_key_orig; + } else { + h = zap->zap_salt; + ASSERT(h != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + const uint64_t *wp = zn->zn_key_norm; + + ASSERT(zn->zn_key_intlen == 8); + for (int i = 0; i < zn->zn_key_norm_numints; + wp++, i++) { + uint64_t word = *wp; + + for (int j = 0; j < 8; j++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ word) & 0xFF]; + word >>= NBBY; + } + } + } else { + const uint8_t *cp = zn->zn_key_norm; + + /* + * We previously stored the terminating null on + * disk, but didn't hash it, so we need to + * continue to not hash it. (The + * zn_key_*_numints includes the terminating + * null for non-binary keys.) + */ + int len = zn->zn_key_norm_numints - 1; + + ASSERT(zn->zn_key_intlen == 1); + for (int i = 0; i < len; cp++, i++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ *cp) & 0xFF]; + } + } + } + /* + * Don't use all 64 bits, since we need some in the cookie for + * the collision differentiator. We MUST use the high bits, + * since those are the ones that we first pay attention to when + * choosing the bucket. 
+ */ + h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); + + return (h); +} + +static int +zap_lock_impl(dnode_t *dn, dmu_buf_t *db, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) +{ + ASSERT0(db->db_offset); + objset_t *os = dmu_buf_get_objset(db); + uint64_t obj = db->db_object; + + *zapp = NULL; + + if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP) + return (SET_ERROR(EINVAL)); + + zap_t *zap = dmu_buf_get_user(db); + if (zap == NULL) { + zap = mzap_open(db); + if (zap == NULL) { + /* + * mzap_open() didn't like what it saw on-disk. + * Check for corruption! + */ + return (SET_ERROR(EIO)); + } + } + + /* + * We're checking zap_ismicro without the lock held, in order to + * tell what type of lock we want. Once we have some sort of + * lock, see if it really is the right type. In practice this + * can only be different if it was upgraded from micro to fat, + * and micro wanted WRITER but fat only needs READER. + */ + krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; + rw_enter(&zap->zap_rwlock, lt); + if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { + /* it was upgraded, now we only need reader */ + ASSERT(lt == RW_WRITER); + ASSERT(RW_READER == + ((!zap->zap_ismicro && fatreader) ? 
RW_READER : lti)); + rw_downgrade(&zap->zap_rwlock); + lt = RW_READER; + } + + zap->zap_objset = os; + zap->zap_dnode = dn; + + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + + ASSERT3P(zap->zap_dbuf, ==, db); + + ASSERT(!zap->zap_ismicro || + zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); + if (zap->zap_ismicro && tx && adding && + zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { + uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; + if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) { + dprintf("upgrading obj %llu: num_entries=%u\n", + (u_longlong_t)obj, zap->zap_m.zap_num_entries); + *zapp = zap; + int err = mzap_upgrade(zapp, tx, 0); + if (err != 0) + rw_exit(&zap->zap_rwlock); + return (err); + } + VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); + zap->zap_m.zap_num_chunks = + db->db_size / MZAP_ENT_LEN - 1; + + if (newsz > SPA_OLD_MAXBLOCKSIZE) { + dsl_dataset_t *ds = dmu_objset_ds(os); + if (!dsl_dataset_feature_is_active(ds, + SPA_FEATURE_LARGE_MICROZAP)) { + /* + * A microzap just grew beyond the old limit + * for the first time, so we have to ensure the + * feature flag is activated. + * zap_get_micro_max_size() won't let us get + * here if the feature is not enabled, so we + * don't need any other checks beforehand. + * + * Since we're in open context, we can't + * activate the feature directly, so we instead + * flag it on the dataset for next sync. 
+ */ + dsl_dataset_dirty(ds, tx); + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation + [SPA_FEATURE_LARGE_MICROZAP] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + } + } + + *zapp = zap; + return (0); +} + +int +zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp) +{ + dmu_buf_t *db; + int err; + + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + err = zap_lock_impl(dn, db, tx, lti, fatreader, adding, zapp); + if (err != 0) + dmu_buf_rele(db, tag); + else + VERIFY(dnode_add_ref(dn, tag)); + return (err); +} + +int +zap_lock(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, obj, tag, &dn); + if (err != 0) + return (err); + err = zap_lock_by_dnode(dn, tx, lti, fatreader, adding, tag, zapp); + dnode_rele(dn, tag); + return (err); +} + +void +zap_unlock(zap_t *zap, const void *tag) +{ + rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); + dmu_buf_rele(zap->zap_dbuf, tag); +} + +int +zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + if (RW_WRITE_HELD(&zap->zap_rwlock)) + /* Already have writer, nothing to do. */ + return (1); + + /* Try to upgrade the lock in-place. */ + if (rw_tryupgrade(&zap->zap_rwlock)) { + /* + * Got it, mark buffer dirty, since we only do that in + * zap_lock_impl() for writer. + */ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + return (1); + } + + return (0); +} + +void +zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + if (zap_lock_try_upgrade(zap, tx)) + return; + + /* + * It's safe to drop the lock here because we still have a hold on + * zap_dbuf, which prevents the dbuf being evicted and the zap_t being + * deallocated. 
+ */ + rw_exit(&zap->zap_rwlock); + + rw_enter(&zap->zap_rwlock, RW_WRITER); + dmu_buf_will_dirty(zap->zap_dbuf, tx); +} + +void +zap_evict_sync(void *dbu) +{ + zap_t *zap = dbu; + + rw_destroy(&zap->zap_rwlock); + + if (zap->zap_ismicro) + mze_destroy(zap); + else + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + + kmem_free(zap, sizeof (zap_t)); +} + +uint64_t +zap_getflags(zap_t *zap) +{ + if (zap->zap_ismicro) + return (0); + return (zap_f_phys(zap)->zap_flags); +} + +int +zap_hashbits(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return (48); + else + return (28); +} + +uint32_t +zap_maxcd(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return ((1<<16)-1); + else + return (-1U); +} + +/* DNU byteswap callback for DMU_BSWAP_ZAP, see dmu_ot_byteswap. */ +void +zap_byteswap(void *buf, size_t size) +{ + uint64_t block_type = *(uint64_t *)buf; + + if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { + /* ASSERT(magic == ZAP_LEAF_MAGIC); */ + mzap_byteswap(buf, size); + } else { + fzap_byteswap(buf, size); + } +} + +/* + * Cursor attribute allocator/free. Part of the public interface in zap.h, + * in this file to get access to the kmem caches. + */ +static zap_attribute_t * +zap_attribute_alloc_impl(boolean_t longname) +{ + zap_attribute_t *za; + + za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache, + KM_SLEEP); + za->za_name_len = (longname)? 
ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; + return (za); +} + +zap_attribute_t * +zap_attribute_alloc(void) +{ + return (zap_attribute_alloc_impl(B_FALSE)); +} + +zap_attribute_t * +zap_attribute_long_alloc(void) +{ + return (zap_attribute_alloc_impl(B_TRUE)); +} + +void +zap_attribute_free(zap_attribute_t *za) +{ + if (za->za_name_len == ZAP_MAXNAMELEN) { + kmem_cache_free(zap_attr_cache, za); + } else { + ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW); + kmem_cache_free(zap_attr_long_cache, za); + } +} diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c index 4e343ebf5d1..a7c9c9c03b4 100644 --- a/sys/contrib/openzfs/module/zfs/zap_micro.c +++ b/sys/contrib/openzfs/module/zfs/zap_micro.c @@ -81,284 +81,7 @@ zap_get_micro_max_size(spa_t *spa) return (SPA_OLD_MAXBLOCKSIZE); } -static int mzap_upgrade(zap_t **zapp, - const void *tag, dmu_tx_t *tx, zap_flags_t flags); - -uint64_t -zap_getflags(zap_t *zap) -{ - if (zap->zap_ismicro) - return (0); - return (zap_f_phys(zap)->zap_flags); -} - -int -zap_hashbits(zap_t *zap) -{ - if (zap_getflags(zap) & ZAP_FLAG_HASH64) - return (48); - else - return (28); -} - -uint32_t -zap_maxcd(zap_t *zap) -{ - if (zap_getflags(zap) & ZAP_FLAG_HASH64) - return ((1<<16)-1); - else - return (-1U); -} - -static uint64_t -zap_hash(zap_name_t *zn) -{ - zap_t *zap = zn->zn_zap; - uint64_t h = 0; - - if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { - ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); - h = *(uint64_t *)zn->zn_key_orig; - } else { - h = zap->zap_salt; - ASSERT(h != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - - if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { - const uint64_t *wp = zn->zn_key_norm; - - ASSERT(zn->zn_key_intlen == 8); - for (int i = 0; i < zn->zn_key_norm_numints; - wp++, i++) { - uint64_t word = *wp; - - for (int j = 0; j < 8; j++) { - h = (h >> 8) ^ - zfs_crc64_table[(h ^ word) & 0xFF]; - word >>= NBBY; - } - } - } else { - const uint8_t *cp = 
zn->zn_key_norm; - - /* - * We previously stored the terminating null on - * disk, but didn't hash it, so we need to - * continue to not hash it. (The - * zn_key_*_numints includes the terminating - * null for non-binary keys.) - */ - int len = zn->zn_key_norm_numints - 1; - - ASSERT(zn->zn_key_intlen == 1); - for (int i = 0; i < len; cp++, i++) { - h = (h >> 8) ^ - zfs_crc64_table[(h ^ *cp) & 0xFF]; - } - } - } - /* - * Don't use all 64 bits, since we need some in the cookie for - * the collision differentiator. We MUST use the high bits, - * since those are the ones that we first pay attention to when - * choosing the bucket. - */ - h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); - - return (h); -} - -static int -zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags, - size_t outlen) -{ - ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); - - size_t inlen = strlen(name) + 1; - - int err = 0; - (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, - normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, - U8_UNICODE_LATEST, &err); - - return (err); -} - -boolean_t -zap_match(zap_name_t *zn, const char *matchname) -{ - boolean_t res = B_FALSE; - ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); - - if (zn->zn_matchtype & MT_NORMALIZE) { - size_t namelen = zn->zn_normbuf_len; - char normbuf[ZAP_MAXNAMELEN]; - char *norm = normbuf; - - /* - * Cannot allocate this on-stack as it exceed the stack-limit of - * 1024. 
- */ - if (namelen > ZAP_MAXNAMELEN) - norm = kmem_alloc(namelen, KM_SLEEP); - - if (zap_normalize(zn->zn_zap, matchname, norm, - zn->zn_normflags, namelen) != 0) { - res = B_FALSE; - } else { - res = (strcmp(zn->zn_key_norm, norm) == 0); - } - if (norm != normbuf) - kmem_free(norm, namelen); - } else { - res = (strcmp(zn->zn_key_orig, matchname) == 0); - } - return (res); -} - -static kmem_cache_t *zap_name_cache; -static kmem_cache_t *zap_attr_cache; -static kmem_cache_t *zap_name_long_cache; -static kmem_cache_t *zap_attr_long_cache; - void -zap_init(void) -{ - zap_name_cache = kmem_cache_create("zap_name", - sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL, - NULL, NULL, NULL, 0); - - zap_attr_cache = kmem_cache_create("zap_attr_cache", - sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL, - NULL, NULL, NULL, NULL, 0); - - zap_name_long_cache = kmem_cache_create("zap_name_long", - sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL, - NULL, NULL, NULL, 0); - - zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache", - sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, - NULL, NULL, NULL, NULL, 0); -} - -void -zap_fini(void) -{ - kmem_cache_destroy(zap_name_cache); - kmem_cache_destroy(zap_attr_cache); - kmem_cache_destroy(zap_name_long_cache); - kmem_cache_destroy(zap_attr_long_cache); -} - -static zap_name_t * -zap_name_alloc(zap_t *zap, boolean_t longname) -{ - kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache; - zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP); - - zn->zn_zap = zap; - zn->zn_normbuf_len = longname ? 
ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; - return (zn); -} - -void -zap_name_free(zap_name_t *zn) -{ - if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) { - kmem_cache_free(zap_name_cache, zn); - } else { - ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW); - kmem_cache_free(zap_name_long_cache, zn); - } -} - -static int -zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) -{ - zap_t *zap = zn->zn_zap; - size_t key_len = strlen(key) + 1; - - /* Make sure zn is allocated for longname if key is long */ - IMPLY(key_len > ZAP_MAXNAMELEN, - zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW); - - zn->zn_key_intlen = sizeof (*key); - zn->zn_key_orig = key; - zn->zn_key_orig_numints = key_len; - zn->zn_matchtype = mt; - zn->zn_normflags = zap->zap_normflags; - - /* - * If we're dealing with a case sensitive lookup on a mixed or - * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup - * will fold case to all caps overriding the lookup request. - */ - if (mt & MT_MATCH_CASE) - zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; - - if (zap->zap_normflags) { - /* - * We *must* use zap_normflags because this normalization is - * what the hash is computed from. - */ - if (zap_normalize(zap, key, zn->zn_normbuf, - zap->zap_normflags, zn->zn_normbuf_len) != 0) - return (SET_ERROR(ENOTSUP)); - zn->zn_key_norm = zn->zn_normbuf; - zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; - } else { - if (mt != 0) - return (SET_ERROR(ENOTSUP)); - zn->zn_key_norm = zn->zn_key_orig; - zn->zn_key_norm_numints = zn->zn_key_orig_numints; - } - - zn->zn_hash = zap_hash(zn); - - if (zap->zap_normflags != zn->zn_normflags) { - /* - * We *must* use zn_normflags because this normalization is - * what the matching is based on. (Not the hash!) 
- */ - if (zap_normalize(zap, key, zn->zn_normbuf, - zn->zn_normflags, zn->zn_normbuf_len) != 0) - return (SET_ERROR(ENOTSUP)); - zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; - } - - return (0); -} - -zap_name_t * -zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) -{ - size_t key_len = strlen(key) + 1; - zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN)); - if (zap_name_init_str(zn, key, mt) != 0) { - zap_name_free(zn); - return (NULL); - } - return (zn); -} - -static zap_name_t * -zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) -{ - zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP); - - ASSERT0(zap->zap_normflags); - zn->zn_zap = zap; - zn->zn_key_intlen = sizeof (*key); - zn->zn_key_orig = zn->zn_key_norm = key; - zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; - zn->zn_matchtype = 0; - zn->zn_normbuf_len = ZAP_MAXNAMELEN; - - zn->zn_hash = zap_hash(zn); - return (zn); -} - -static void mzap_byteswap(mzap_phys_t *buf, size_t size) { buf->mz_block_type = BSWAP_64(buf->mz_block_type); @@ -373,19 +96,6 @@ mzap_byteswap(mzap_phys_t *buf, size_t size) } } -void -zap_byteswap(void *buf, size_t size) -{ - uint64_t block_type = *(uint64_t *)buf; - - if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { - /* ASSERT(magic == ZAP_LEAF_MAGIC); */ - mzap_byteswap(buf, size); - } else { - fzap_byteswap(buf, size); - } -} - __attribute__((always_inline)) inline static int mze_compare(const void *arg1, const void *arg2) @@ -417,7 +127,7 @@ mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) zfs_btree_add(&zap->zap_m.zap_tree, &mze); } -static mzap_ent_t * +mzap_ent_t * mze_find(zap_name_t *zn, zfs_btree_index_t *idx) { mzap_ent_t mze_tofind; @@ -482,7 +192,7 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) * Check if the current entry keeps the colliding entries under the fatzap leaf * size. 
*/ -static boolean_t +boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) { zap_t *zap = zn->zn_zap; @@ -508,14 +218,14 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); } -static void +void mze_destroy(zap_t *zap) { zfs_btree_clear(&zap->zap_m.zap_tree); zfs_btree_destroy(&zap->zap_m.zap_tree); } -static zap_t * +zap_t * mzap_open(dmu_buf_t *db) { zap_t *winner; @@ -545,9 +255,8 @@ mzap_open(dmu_buf_t *db) } /* - * Make sure that zap_ismicro is set before we let others see - * it, because zap_lockdir() checks zap_ismicro without the lock - * held. + * Make sure that zap_ismicro is set before we let others see it, + * because zap_lock() checks zap_ismicro without the lock held. */ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); winner = dmu_buf_set_user(db, &zap->zap_dbu); @@ -614,163 +323,8 @@ mzap_open(dmu_buf_t *db) return (winner); } -/* - * This routine "consumes" the caller's hold on the dbuf, which must - * have the specified tag. - */ -static int -zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) -{ - ASSERT0(db->db_offset); - objset_t *os = dmu_buf_get_objset(db); - uint64_t obj = db->db_object; - - *zapp = NULL; - - if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP) - return (SET_ERROR(EINVAL)); - - zap_t *zap = dmu_buf_get_user(db); - if (zap == NULL) { - zap = mzap_open(db); - if (zap == NULL) { - /* - * mzap_open() didn't like what it saw on-disk. - * Check for corruption! - */ - return (SET_ERROR(EIO)); - } - } - - /* - * We're checking zap_ismicro without the lock held, in order to - * tell what type of lock we want. Once we have some sort of - * lock, see if it really is the right type. In practice this - * can only be different if it was upgraded from micro to fat, - * and micro wanted WRITER but fat only needs READER. 
- */ - krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; - rw_enter(&zap->zap_rwlock, lt); - if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { - /* it was upgraded, now we only need reader */ - ASSERT(lt == RW_WRITER); - ASSERT(RW_READER == - ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)); - rw_downgrade(&zap->zap_rwlock); - lt = RW_READER; - } - - zap->zap_objset = os; - zap->zap_dnode = dn; - - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - - ASSERT3P(zap->zap_dbuf, ==, db); - - ASSERT(!zap->zap_ismicro || - zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); - if (zap->zap_ismicro && tx && adding && - zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { - uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) { - dprintf("upgrading obj %llu: num_entries=%u\n", - (u_longlong_t)obj, zap->zap_m.zap_num_entries); - *zapp = zap; - int err = mzap_upgrade(zapp, tag, tx, 0); - if (err != 0) - rw_exit(&zap->zap_rwlock); - return (err); - } - VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); - zap->zap_m.zap_num_chunks = - db->db_size / MZAP_ENT_LEN - 1; - - if (newsz > SPA_OLD_MAXBLOCKSIZE) { - dsl_dataset_t *ds = dmu_objset_ds(os); - if (!dsl_dataset_feature_is_active(ds, - SPA_FEATURE_LARGE_MICROZAP)) { - /* - * A microzap just grew beyond the old limit - * for the first time, so we have to ensure the - * feature flag is activated. - * zap_get_micro_max_size() won't let us get - * here if the feature is not enabled, so we - * don't need any other checks beforehand. - * - * Since we're in open context, we can't - * activate the feature directly, so we instead - * flag it on the dataset for next sync. 
- */ - dsl_dataset_dirty(ds, tx); - mutex_enter(&ds->ds_lock); - ds->ds_feature_activation - [SPA_FEATURE_LARGE_MICROZAP] = - (void *)B_TRUE; - mutex_exit(&ds->ds_lock); - } - } - } - - *zapp = zap; - return (0); -} - -static int -zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, - zap_t **zapp) -{ - dmu_buf_t *db; - int err; - - err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) - dmu_buf_rele(db, tag); - else - VERIFY(dnode_add_ref(dn, tag)); - return (err); -} - int -zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, - zap_t **zapp) -{ - dnode_t *dn; - dmu_buf_t *db; - int err; - - err = dnode_hold(os, obj, tag, &dn); - if (err != 0) - return (err); - err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { - dnode_rele(dn, tag); - return (err); - } - err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { - dmu_buf_rele(db, tag); - dnode_rele(dn, tag); - } - return (err); -} - -void -zap_unlockdir(zap_t *zap, const void *tag) -{ - rw_exit(&zap->zap_rwlock); - dnode_rele(zap->zap_dnode, tag); - dmu_buf_rele(zap->zap_dbuf, tag); -} - -static int -mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) +mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) { int err = 0; zap_t *zap = *zapp; @@ -808,8 +362,7 @@ mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) zap_name_init_str(zn, mze->mze_name, 0); /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, - tag, tx)); - zap = zn->zn_zap; /* fzap_add_cd() may change zap */ + tx)); } zap_name_free(zn); vmem_free(mzp, sz); @@ -851,227 +404,20 @@ mzap_create_impl(dnode_t *dn, 
int normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY(dnode_add_ref(dn, FTAG)); - VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, - B_FALSE, B_FALSE, &zap)); - VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); - zap_unlockdir(zap, FTAG); - } else { - dmu_buf_rele(db, FTAG); - } -} - -static uint64_t -zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) -{ - uint64_t obj; - - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - - if (allocated_dnode == NULL) { - dnode_t *dn; - obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, - indirect_blockshift, bonustype, bonuslen, dnodesize, - &dn, FTAG, tx); - mzap_create_impl(dn, normflags, flags, tx); - dnode_rele(dn, FTAG); - } else { - obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, - indirect_blockshift, bonustype, bonuslen, dnodesize, - allocated_dnode, tag, tx); - mzap_create_impl(*allocated_dnode, normflags, flags, tx); + VERIFY0(zap_lock_by_dnode(dn, tx, + RW_WRITER, B_FALSE, B_FALSE, FTAG, &zap)); + VERIFY0(mzap_upgrade(&zap, tx, flags)); + zap_unlock(zap, FTAG); } - return (obj); -} - -int -zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, - 0, tx)); -} - -int -zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_claim_norm_dnsize(os, obj, - 0, ot, bonustype, bonuslen, dnodesize, tx)); -} - -int -zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, - dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - 
return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, - bonuslen, 0, tx)); -} - -int -zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, - dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx) -{ - dnode_t *dn; - int error; - - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, - dnodesize, tx); - if (error != 0) - return (error); - - error = dnode_hold(os, obj, FTAG, &dn); - if (error != 0) - return (error); - - mzap_create_impl(dn, normflags, 0, tx); - - dnode_rele(dn, FTAG); - - return (0); -} - -uint64_t -zap_create(objset_t *os, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); -} - -uint64_t -zap_create_dnsize(objset_t *os, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, - dnodesize, tx)); -} - -uint64_t -zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, - 0, tx)); -} - -uint64_t -zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_impl(os, normflags, 0, ot, 0, 0, - bonustype, bonuslen, dnodesize, NULL, NULL, tx)); -} - -uint64_t -zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_flags_dnsize(os, normflags, flags, ot, - leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); -} - -uint64_t -zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t 
flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, - indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, - tx)); -} - -/* - * Create a zap object and return a pointer to the newly allocated dnode via - * the allocated_dnode argument. The returned dnode will be held and the - * caller is responsible for releasing the hold by calling dnode_rele(). - */ -uint64_t -zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) -{ - return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, - indirect_blockshift, bonustype, bonuslen, dnodesize, - allocated_dnode, tag, tx)); -} - -int -zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) -{ - /* - * dmu_object_free will free the object number and free the - * data. Freeing the data will cause our pageout function to be - * called, which will destroy our data (zap_leaf_t's and zap_t). 
- */ - - return (dmu_object_free(os, zapobj, tx)); -} - -void -zap_evict_sync(void *dbu) -{ - zap_t *zap = dbu; - - rw_destroy(&zap->zap_rwlock); - - if (zap->zap_ismicro) - mze_destroy(zap); - else - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - - kmem_free(zap, sizeof (zap_t)); -} - -int -zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - if (!zap->zap_ismicro) { - err = fzap_count(zap, count); - } else { - *count = zap->zap_m.zap_num_entries; - } - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_count_by_dnode(dnode_t *dn, uint64_t *count) -{ - zap_t *zap; - - int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - if (!zap->zap_ismicro) { - err = fzap_count(zap, count); - } else { - *count = zap->zap_m.zap_num_entries; - } - zap_unlockdir(zap, FTAG); - return (err); + dmu_buf_rele(db, FTAG); } /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). */ -static boolean_t +boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, zfs_btree_index_t *idx) { @@ -1119,340 +465,7 @@ mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, return (B_FALSE); } -/* - * Routines for manipulating attributes. 
- */ - -int -zap_lookup(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - return (zap_lookup_norm(os, zapobj, name, integer_size, - num_integers, buf, 0, NULL, 0, NULL)); -} - -static int -zap_lookup_impl(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - int err = 0; - - zap_name_t *zn = zap_name_alloc_str(zap, name, mt); - if (zn == NULL) - return (SET_ERROR(ENOTSUP)); - - if (!zap->zap_ismicro) { - err = fzap_lookup(zn, integer_size, num_integers, buf, - realname, rn_len, ncp, NULL); - } else { - zfs_btree_index_t idx; - mzap_ent_t *mze = mze_find(zn, &idx); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - if (num_integers < 1) { - err = SET_ERROR(EOVERFLOW); - } else if (integer_size != 8) { - err = SET_ERROR(EINVAL); - } else { - *(uint64_t *)buf = - MZE_PHYS(zap, mze)->mze_value; - if (realname != NULL) - (void) strlcpy(realname, - MZE_PHYS(zap, mze)->mze_name, - rn_len); - if (ncp) { - *ncp = mzap_normalization_conflict(zap, - zn, mze, &idx); - } - } - } - } - zap_name_free(zn); - return (err); -} - -int -zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_impl(zap, name, integer_size, - num_integers, buf, mt, realname, rn_len, ncp); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) -{ - zap_t *zap; - int err; - zap_name_t *zn; - - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - zn = zap_name_alloc_str(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return 
(SET_ERROR(ENOTSUP)); - } - - fzap_prefetch(zn); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_prefetch_object(objset_t *os, uint64_t zapobj) -{ - int error; - dmu_object_info_t doi; - - error = dmu_object_info(os, zapobj, &doi); - if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) - error = SET_ERROR(EINVAL); - if (error == 0) - dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset); - - return (error); -} - -int -zap_lookup_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - return (zap_lookup_norm_by_dnode(dn, name, integer_size, - num_integers, buf, 0, NULL, 0, NULL)); -} - -int -zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - zap_t *zap; - - int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_impl(zap, name, integer_size, - num_integers, buf, mt, realname, rn_len, ncp); - zap_unlockdir(zap, FTAG); - return (err); -} - -static int -zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints) -{ - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - fzap_prefetch(zn); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (0); -} - -int -zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_prefetch_uint64_impl(zap, key, key_numints); - /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, NULL, 
RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_prefetch_uint64_impl(zap, key, key_numints); - /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -static int -zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf, - uint64_t *actual_num_integers) -{ - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - int err = fzap_lookup(zn, integer_size, num_integers, buf, - NULL, 0, NULL, actual_num_integers); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_length_uint64_impl(zap, key, key_numints, - integer_size, num_integers, buf, NULL); - /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_length_uint64_impl(zap, key, key_numints, - integer_size, num_integers, buf, NULL); - /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf, - uint64_t *actual_num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = 
zap_lookup_length_uint64_impl(zap, key, key_numints, - integer_size, num_integers, buf, actual_num_integers); - /* zap_lookup_length_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_contains(objset_t *os, uint64_t zapobj, const char *name) -{ - int err = zap_lookup_norm(os, zapobj, name, 0, - 0, NULL, 0, NULL, 0, NULL); - if (err == EOVERFLOW || err == EINVAL) - err = 0; /* found, but skipped reading the value */ - return (err); -} - -int -zap_length(objset_t *os, uint64_t zapobj, const char *name, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_str(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_length(zn, integer_size, num_integers); - } else { - zfs_btree_index_t idx; - mzap_ent_t *mze = mze_find(zn, &idx); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - if (integer_size) - *integer_size = 8; - if (num_integers) - *num_integers = 1; - } - } - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_length(zn, integer_size, num_integers); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key, - int key_numints, uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); 
- if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_length(zn, integer_size, num_integers); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -static void +void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; @@ -1495,612 +508,6 @@ mzap_addent(zap_name_t *zn, uint64_t value) cmn_err(CE_PANIC, "out of entries!"); } -static int -zap_add_impl(zap_t *zap, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx, const void *tag) -{ - const uint64_t *intval = val; - int err = 0; - - zap_name_t *zn = zap_name_alloc_str(zap, key, 0); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_add(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - } else if (integer_size != 8 || num_integers != 1 || - strlen(key) >= MZAP_NAME_LEN || - !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { - err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); - if (err == 0) { - err = fzap_add(zn, integer_size, num_integers, val, - tag, tx); - } - zap = zn->zn_zap; /* fzap_add() may change zap */ - } else { - zfs_btree_index_t idx; - if (mze_find(zn, &idx) != NULL) { - err = SET_ERROR(EEXIST); - } else { - mzap_addent(zn, *intval); - } - } - ASSERT(zap == zn->zn_zap); - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_add(objset_t *os, uint64_t zapobj, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); - /* zap_add_impl() calls zap_unlockdir() */ - return 
(err); -} - -int -zap_add_by_dnode(dnode_t *dn, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); - /* zap_add_impl() calls zap_unlockdir() */ - return (err); -} - -static int -zap_add_uint64_impl(zap_t *zap, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx, const void *tag) -{ - int err; - - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_uint64_impl(zap, key, key_numints, - integer_size, num_integers, val, tx, FTAG); - /* zap_add_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_uint64_impl(zap, key, key_numints, - integer_size, num_integers, val, tx, FTAG); - /* zap_add_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_update(objset_t *os, uint64_t zapobj, const char *name, - int 
integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - const uint64_t *intval = val; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_str(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_update(zn, integer_size, num_integers, val, - FTAG, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ - } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - (u_longlong_t)zapobj, integer_size, - (u_longlong_t)num_integers, name); - err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); - if (err == 0) { - err = fzap_update(zn, integer_size, num_integers, - val, FTAG, tx); - } - zap = zn->zn_zap; /* fzap_update() may change zap */ - } else { - zfs_btree_index_t idx; - mzap_ent_t *mze = mze_find(zn, &idx); - if (mze != NULL) { - MZE_PHYS(zap, mze)->mze_value = *intval; - } else { - mzap_addent(zn, *intval); - } - } - ASSERT(zap == zn->zn_zap); - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); - return (err); -} - -static int -zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, - const void *tag) -{ - int err; - - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_update(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, int integer_size, 
uint64_t num_integers, const void *val, - dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_update_uint64_impl(zap, key, key_numints, - integer_size, num_integers, val, tx, FTAG); - /* zap_update_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_update_uint64_impl(zap, key, key_numints, - integer_size, num_integers, val, tx, FTAG); - /* zap_update_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) -{ - return (zap_remove_norm(os, zapobj, name, 0, tx)); -} - -static int -zap_remove_impl(zap_t *zap, const char *name, - matchtype_t mt, dmu_tx_t *tx) -{ - int err = 0; - - zap_name_t *zn = zap_name_alloc_str(zap, name, mt); - if (zn == NULL) - return (SET_ERROR(ENOTSUP)); - if (!zap->zap_ismicro) { - err = fzap_remove(zn, tx); - } else { - zfs_btree_index_t idx; - mzap_ent_t *mze = mze_find(zn, &idx); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - zap->zap_m.zap_num_entries--; - memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); - zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); - } - } - zap_name_free(zn); - return (err); -} - -int -zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, - matchtype_t mt, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - err = zap_remove_impl(zap, name, mt, tx); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = 
zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - err = zap_remove_impl(zap, name, 0, tx); - zap_unlockdir(zap, FTAG); - return (err); -} - -static int -zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, - dmu_tx_t *tx, const void *tag) -{ - int err; - - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); - /* zap_remove_uint64_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, - dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); - /* zap_remove_uint64_impl() calls zap_unlockdir() */ - return (err); -} - - -static zap_attribute_t * -zap_attribute_alloc_impl(boolean_t longname) -{ - zap_attribute_t *za; - - za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache, - KM_SLEEP); - za->za_name_len = (longname)? 
ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; - return (za); -} - -zap_attribute_t * -zap_attribute_alloc(void) -{ - return (zap_attribute_alloc_impl(B_FALSE)); -} - -zap_attribute_t * -zap_attribute_long_alloc(void) -{ - return (zap_attribute_alloc_impl(B_TRUE)); -} - -void -zap_attribute_free(zap_attribute_t *za) -{ - if (za->za_name_len == ZAP_MAXNAMELEN) { - kmem_cache_free(zap_attr_cache, za); - } else { - ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW); - kmem_cache_free(zap_attr_long_cache, za); - } -} - -/* - * Routines for iterating over the attributes. - */ - -static void -zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized, boolean_t prefetch) -{ - zc->zc_objset = os; - zc->zc_zap = NULL; - zc->zc_leaf = NULL; - zc->zc_zapobj = zapobj; - zc->zc_serialized = serialized; - zc->zc_hash = 0; - zc->zc_cd = 0; - zc->zc_prefetch = prefetch; -} -void -zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized) -{ - zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); -} - -/* - * Initialize a cursor at the beginning of the ZAP object. The entire - * ZAP object will be prefetched. - */ -void -zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); -} - -/* - * Initialize a cursor at the beginning, but request that we not prefetch - * the entire ZAP object. 
- */ -void -zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); -} - -void -zap_cursor_fini(zap_cursor_t *zc) -{ - if (zc->zc_zap) { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - zap_unlockdir(zc->zc_zap, NULL); - zc->zc_zap = NULL; - } - if (zc->zc_leaf) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - zc->zc_objset = NULL; -} - -uint64_t -zap_cursor_serialize(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return (-1ULL); - if (zc->zc_zap == NULL) - return (zc->zc_serialized); - ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap))); - ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); - - /* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So usually use a small - * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits - * of the cursor. - * - * [ collision differentiator | zap_hashbits()-bit hash value ] - */ - return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | - ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); -} - -int -zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) -{ - int err; - - if (zc->zc_hash == -1ULL) - return (SET_ERROR(ENOENT)); - - if (zc->zc_zap == NULL) { - int hb; - err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); - if (err != 0) - return (err); - - /* - * To support zap_cursor_init_serialized, advance, retrieve, - * we must add to the existing zc_cd, which may already - * be 1 due to the zap_cursor_advance. 
- */ - ASSERT0(zc->zc_hash); - hb = zap_hashbits(zc->zc_zap); - zc->zc_hash = zc->zc_serialized << (64 - hb); - zc->zc_cd += zc->zc_serialized >> hb; - if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ - zc->zc_cd = 0; - } else { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - } - if (!zc->zc_zap->zap_ismicro) { - err = fzap_cursor_retrieve(zc->zc_zap, zc, za); - } else { - zfs_btree_index_t idx; - mzap_ent_t mze_tofind; - - mze_tofind.mze_hash = zc->zc_hash >> 32; - mze_tofind.mze_cd = zc->zc_cd; - - mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, - &mze_tofind, &idx); - if (mze == NULL) { - mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, - &idx, &idx); - } - if (mze) { - mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); - ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); - za->za_normalization_conflict = - mzap_normalization_conflict(zc->zc_zap, NULL, - mze, &idx); - za->za_integer_length = 8; - za->za_num_integers = 1; - za->za_first_integer = mzep->mze_value; - (void) strlcpy(za->za_name, mzep->mze_name, - za->za_name_len); - zc->zc_hash = (uint64_t)mze->mze_hash << 32; - zc->zc_cd = mze->mze_cd; - err = 0; - } else { - zc->zc_hash = -1ULL; - err = SET_ERROR(ENOENT); - } - } - rw_exit(&zc->zc_zap->zap_rwlock); - return (err); -} - -void -zap_cursor_advance(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return; - zc->zc_cd++; -} - -int -zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - - memset(zs, 0, sizeof (zap_stats_t)); - - if (zap->zap_ismicro) { - zs->zs_blocksize = zap->zap_dbuf->db_size; - zs->zs_num_entries = zap->zap_m.zap_num_entries; - zs->zs_num_blocks = 1; - } else { - fzap_get_stats(zap, zs); - } - zap_unlockdir(zap, FTAG); - return (0); -} - -#if defined(_KERNEL) -EXPORT_SYMBOL(zap_create); -EXPORT_SYMBOL(zap_create_dnsize); -EXPORT_SYMBOL(zap_create_norm); 
-EXPORT_SYMBOL(zap_create_norm_dnsize); -EXPORT_SYMBOL(zap_create_flags); -EXPORT_SYMBOL(zap_create_flags_dnsize); -EXPORT_SYMBOL(zap_create_claim); -EXPORT_SYMBOL(zap_create_claim_norm); -EXPORT_SYMBOL(zap_create_claim_norm_dnsize); -EXPORT_SYMBOL(zap_create_hold); -EXPORT_SYMBOL(zap_destroy); -EXPORT_SYMBOL(zap_lookup); -EXPORT_SYMBOL(zap_lookup_by_dnode); -EXPORT_SYMBOL(zap_lookup_norm); -EXPORT_SYMBOL(zap_lookup_uint64); -EXPORT_SYMBOL(zap_lookup_length_uint64_by_dnode); -EXPORT_SYMBOL(zap_contains); -EXPORT_SYMBOL(zap_prefetch); -EXPORT_SYMBOL(zap_prefetch_uint64); -EXPORT_SYMBOL(zap_prefetch_object); -EXPORT_SYMBOL(zap_add); -EXPORT_SYMBOL(zap_add_by_dnode); -EXPORT_SYMBOL(zap_add_uint64); -EXPORT_SYMBOL(zap_add_uint64_by_dnode); -EXPORT_SYMBOL(zap_update); -EXPORT_SYMBOL(zap_update_uint64); -EXPORT_SYMBOL(zap_update_uint64_by_dnode); -EXPORT_SYMBOL(zap_length); -EXPORT_SYMBOL(zap_length_uint64); -EXPORT_SYMBOL(zap_length_uint64_by_dnode); -EXPORT_SYMBOL(zap_remove); -EXPORT_SYMBOL(zap_remove_by_dnode); -EXPORT_SYMBOL(zap_remove_norm); -EXPORT_SYMBOL(zap_remove_uint64); -EXPORT_SYMBOL(zap_remove_uint64_by_dnode); -EXPORT_SYMBOL(zap_count); -EXPORT_SYMBOL(zap_count_by_dnode); -EXPORT_SYMBOL(zap_value_search); -EXPORT_SYMBOL(zap_join); -EXPORT_SYMBOL(zap_join_increment); -EXPORT_SYMBOL(zap_add_int); -EXPORT_SYMBOL(zap_remove_int); -EXPORT_SYMBOL(zap_lookup_int); -EXPORT_SYMBOL(zap_increment_int); -EXPORT_SYMBOL(zap_add_int_key); -EXPORT_SYMBOL(zap_lookup_int_key); -EXPORT_SYMBOL(zap_increment); -EXPORT_SYMBOL(zap_cursor_init); -EXPORT_SYMBOL(zap_cursor_fini); -EXPORT_SYMBOL(zap_cursor_retrieve); -EXPORT_SYMBOL(zap_cursor_advance); -EXPORT_SYMBOL(zap_cursor_serialize); -EXPORT_SYMBOL(zap_cursor_init_serialized); -EXPORT_SYMBOL(zap_get_stats); - ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, "Maximum micro ZAP size before converting to a fat ZAP, " "in bytes (max 1M)"); -#endif diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c 
b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index fe98e7db073..a23f397e698 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -1088,6 +1088,23 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) ZFS_DELEG_PERM_CREATE, cr)); } +/* + * Policy for dataset set property operations. Individual properties checked by + * zfs_check_settable(), additionally require zfs_secpolicy_recv() when setting + * properties as part of a receive. + */ +static int +zfs_secpolicy_setprops(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + boolean_t received = zc->zc_cookie; + int error; + + if (received && (error = zfs_secpolicy_recv(zc, innvl, cr))) + return (error); + + return (zfs_secpolicy_read(zc, innvl, cr)); +} + int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { @@ -3456,12 +3473,15 @@ zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) ASSERT(spa_writeable(spa)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_set(vd, innvl, outnvl); + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); @@ -3500,12 +3520,15 @@ zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_get(vd, innvl, outnvl); + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); @@ -4120,7 +4143,6 @@ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { (void) unused, (void) outnvl; - const char *message; char *poolname; spa_t *spa; int error; @@ -4141,7 
+4163,7 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) if (error != 0) return (error); - message = fnvlist_lookup_string(innvl, "message"); + const char *message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); @@ -6647,21 +6669,27 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor + * + * The zc_nvlist_dst output array is limited to 1000 entries. */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { + const size_t batch_limit = 1000 * sizeof (zfs_useracct_t); + uint64_t bufsize = MIN(zc->zc_nvlist_dst_size, batch_limit); zfsvfs_t *zfsvfs; - int bufsize = zc->zc_nvlist_dst_size; - if (bufsize <= 0) + if (bufsize < sizeof (zfs_useracct_t)) { + zc->zc_nvlist_dst_size = sizeof (zfs_useracct_t); return (SET_ERROR(ENOMEM)); + } int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); + zc->zc_nvlist_dst_size = bufsize; error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size, &zc->zc_guid); @@ -7152,7 +7180,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) dsl_pool_t *dp; dsl_dataset_t *new, *old; const char *firstsnap; - uint64_t used, comp, uncomp; + uint64_t used = 0, comp = 0, uncomp = 0; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); @@ -8045,7 +8073,7 @@ zfs_ioctl_init(void) zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, - zfs_secpolicy_none); + zfs_secpolicy_setprops); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c index 85b7a549b9a..0b51f8669cb 100644 --- 
a/sys/contrib/openzfs/module/zfs/zfs_quota.c +++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c @@ -86,10 +86,14 @@ zpl_get_file_info(dmu_object_type_t bonustype, const void *data, sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); swap = B_TRUE; } - VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + + if (unlikely(sa.sa_magic != SA_MAGIC)) + return (SET_ERROR(EINVAL)); int hdrsize = sa_hdrsize(&sa); - VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + + if (unlikely(hdrsize < sizeof (sa_hdr_phys_t))) + return (SET_ERROR(EINVAL)); uintptr_t data_after_hdr = (uintptr_t)data + hdrsize; zoi->zfi_user = *((uint64_t *)(data_after_hdr + SA_UID_OFFSET)); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 0fa58d5ccb6..433d27dd2d1 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -499,7 +499,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; - char *lrp, *end; + char *lrp = NULL, *end = NULL; arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 94b44561bd9..4b7c13dd1e9 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -3830,7 +3830,6 @@ zio_ddt_write(zio_t *zio) int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - ddt_univ_phys_t *ddp = dde->dde_phys; /* * In the common cases, at this point we have a regular BP with no @@ -3861,14 +3860,6 @@ zio_ddt_write(zio_t *zio) * end of the chain and letting the sequence play out. */ - /* - * Number of DVAs in the DDT entry. If the BP is encrypted we ignore - * the third one as normal. 
- */ - int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); - IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); - boolean_t is_ganged = ddt_phys_is_gang(ddp, v); - /* Number of DVAs requested by the IO. */ uint8_t need_dvas = zp->zp_copies; /* Number of DVAs in outstanding writes for this dde. */ @@ -3883,6 +3874,21 @@ zio_ddt_write(zio_t *zio) if (dde_io != NULL) mutex_enter(&dde_io->dde_io_lock); + /* + * Number of DVAs in the DDT entry. If the BP is encrypted we ignore + * the third one as normal. + * + * Must be computed after taking dde_io_lock (if held) to avoid + * racing with ddt_phys_unextend() in zio_ddt_child_write_done() + * error path, which can zero DVAs under dde_io_lock. Without the + * lock, a stale have_dvas causes ddt_bp_fill() to copy a zeroed + * DVA into the BP, producing a hole that reads back as zeros. + */ + ddt_univ_phys_t *ddp = dde->dde_phys; + int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); + IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); + boolean_t is_ganged = ddt_phys_is_gang(ddp, v); + if (dde_io == NULL || dde_io->dde_lead_zio[p] == NULL) { /* * No IO outstanding, so we only need to worry about ourselves. @@ -4168,14 +4174,21 @@ zio_ddt_free(zio_t *zio) } ddt_exit(ddt); - /* - * When no entry was found, it must have been pruned, - * so we can free it now instead of decrementing the - * refcount in the DDT. - */ - if (!dde) { + if (dde) { + /* + * DDT entry found and the refcount has been decremented. + * Stop the pipeline — there is nothing more to do right now. + */ + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } else { + /* + * No DDT entry; the block must have been pruned from the + * table. Clear the DEDUP bit so it is treated as a normal + * block from here on. BRT_FREE and DVA_FREE follow in the + * pipeline and will handle any cloned references and the + * actual block free respectively. 
+ */ BP_SET_DEDUP(bp, 0); - zio->io_pipeline |= ZIO_STAGE_DVA_FREE; } return (zio); @@ -5925,11 +5938,11 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_encrypt, zio_checksum_generate, zio_nop_write, - zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, + zio_brt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c index 88820ab4430..f1f27d74397 100644 --- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c +++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c @@ -693,6 +693,15 @@ zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len, return (1); } + /* + * An OpenZFS compressed block must expand to exactly d_len bytes. + * ZSTD_decompressDCtx returns the decompressed size on success. + */ + if (result != d_len) { + ZSTDSTAT_BUMP(zstd_stat_dec_fail); + return (1); + } + if (level) { *level = curlevel; } diff --git a/sys/contrib/openzfs/rpm/generic/zfs.spec.in b/sys/contrib/openzfs/rpm/generic/zfs.spec.in index 48ed7bf2eb7..71923a7808e 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs.spec.in @@ -525,7 +525,6 @@ systemctl --system daemon-reload >/dev/null || true %config(noreplace) %{_sysconfdir}/%{name}/zed.d/* %config(noreplace) %{_sysconfdir}/%{name}/zpool.d/* %config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example -%attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/* %config(noreplace) %{_bashcompletiondir}/zfs %config(noreplace) %{_bashcompletiondir}/zpool diff --git a/sys/contrib/openzfs/scripts/Makefile.am b/sys/contrib/openzfs/scripts/Makefile.am index df2fae42fce..ed18a81b375 100644 --- a/sys/contrib/openzfs/scripts/Makefile.am +++ b/sys/contrib/openzfs/scripts/Makefile.am @@ -28,6 +28,7 @@ dist_noinst_SCRIPTS += $(scripts_scripts) endif dist_noinst_DATA += \ + %D%/coverage_report.pl \ %D%/cstyle.pl \ %D%/update_authors.pl diff --git 
a/sys/contrib/openzfs/scripts/coverage_report.pl b/sys/contrib/openzfs/scripts/coverage_report.pl new file mode 100755 index 00000000000..ba8dec7a8d9 --- /dev/null +++ b/sys/contrib/openzfs/scripts/coverage_report.pl @@ -0,0 +1,392 @@ +#!/usr/bin/env perl + +# SPDX-License-Identifier: MIT +# +# Copyright (c) 2025, Rob Norris +# Copyright (c) 2026, TrueNAS. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +# +# usage: coverage_report.pl tests/unit/test_zap.info +# coverage_report.pl < tests/unit/test_zap.info +# +# This program takes an lcov/geninfo coverage tracefile and shows a summary +# of line, branch and function coverage for each file. It's focused on the +# specific needs of OpenZFS' unit test suite (see tests/unit/README.md) but +# it should be adaptable to any place where lcov's HTML output is too heavy +# or difficult to use (eg build/CI logs). 
+# +# The heart of this program is a small parser for the tracefile format as +# described in geninfo(1). The rest is concerned with constructing a useful +# colorised table output. +# + +# +# Typical output: +# +# Coverage: test_zap | By line | By branch | By function +# | Rate% Total Hit | Rate% Total Hit | Rate% Total Hit +# module/zfs/u8_textprep.c | 42.0% 802 337 | 33.5% 510 171 | 50.0% 12 6 +# module/zfs/zap.c | 52.1% 687 358 | 45.2% 250 113 | 41.1% 90 37 +# module/zfs/zap_fat.c | 87.8% 665 584 | 58.5% 446 261 | 94.6% 37 35 +# module/zfs/zap_impl.c | 81.9% 232 190 | 60.3% 146 88 | 92.0% 25 23 +# module/zfs/zap_leaf.c | 86.7% 466 404 | 69.0% 216 149 | 95.7% 23 22 +# module/zfs/zap_micro.c | 76.5% 238 182 | 54.2% 142 77 | 92.9% 14 13 +# + +use 5.010; +use warnings; +use strict; +use Cwd qw(getcwd); +use Term::ANSIColor qw(colored); + +# Setup for color output. Perl has included Term::ANSIColor since 5.6 (~2000), +# but RGB support didn't arrive until v4 in 5.17.8 (~2012). We disable colors +# outright on versions < 4, or if output is not attached to a terminal. +my $use_colors = -t \*STDOUT && $Term::ANSIColor::VERSION >= 4; + +# Palette setup. If Term::ANSIColor and the terminal advertise support for +# it, then we set up a pleasant red -> green gradient for the coverage +# percentages. If not, we scale those colors down to the older RGB-240 colors +# (0-5 for each component), which is still quite nice. +my @palette = !$use_colors ? () : map { + state $has_truecolor = + $Term::ANSIColor::VERSION >= 5 && $ENV{COLORTERM}; + my @rgb = map { hex } m/../g; + if ($has_truecolor) { + sprintf 'r%dg%db%d', @rgb; + } else { + sprintf 'rgb%d%d%d', map { $_ * 6 / 255 } @rgb; + } +} ( + # Catppuccin Latte + # https://catppuccin.com/palette/ + 'd20f39', # Red + 'e64553', # Maroon + 'fe640b', # Peach + 'df8e1d', # Yellow + '40a02b', # Green + '179299', # Teal +); + +# Test name, from the TN: field if present. 
+my $test_name = ''; + +# Per-file data, initially sourced from the tracefile, then augmented +my %filedata; + +# Tracking for the longest (stringified) value for each key. These are used +# later when computing the output table column width. +my %len; +sub bump_len { + my ($k, $x) = @_; + my $l = length "".$x; + $len{$k} = $l if ($len{$k} // 0) < $l; +} + +### +# Parse the tracefile into per-file data records. + +# Current working directory. Expected to be the build root. Used to remove +# the leading part of the source filenames, so it's not the end of the world +# if it's wrong. +my $cwd = getcwd; + +# Loop over the input +while (my $line = <>) { + state $data = {}; + chomp $line; + + # skip comments + next if $line =~ m/^#/; + + if ($line eq 'end_of_record') { + # end of this file, prep for next + $data = {}; + next; + } + + # everything else should be a KEY:VALUE line + my ($k, $v) = $line =~ m/^([A-Z]+):(.*)$/; + unless (defined $k) { + say "W: $.: malformed line: $line"; + next; + } + + if ($k eq 'TN') { + # TN:test_zap + + # Test name. This is actually per-record (a tracefile can + # carry multiple test results) but we only ever generate + # them for a single test, so we don't make any effort to + # notice or track changes. + $test_name = $v; + next; + } + + if ($k eq 'SF') { + # SF:/home/robn/code/zfs-unit/module/zfs/zap.c + + # Source file. Value is the name, and the rest of the record + # applies to it. + + # Remove the leading build root name. + my $path = $v; + $path =~ s{^$cwd/*}{}; + + # If we haven't seen this file before, create a new data + # record for it. + $filedata{$v} //= { path => $path }; + $data = $filedata{$v}; + + # Increase path column width if necessary. + bump_len('path', $path); + next; + } + + # Handle the counter keys. These are single values for the entire + # record in the file. L, FN and BR are Line, Function and Branch, + # F and H are found (ie total) and hit (ie was executed). 
+ if (grep { $_ eq $k } qw(LF LH FNF FNH BRF BRH)) { + $data->{lc $k} = $v; + bump_len(lc $k, $v); + next; + } + + # Older versions of lcov may not emit absolute found/hit counters. To + # handle this, we maintain our own counters from other events recorded + # in the info file, which we use if we don't get an absolute count. + + if ($k eq 'DA') { + # DA:<line number>,<execution count>[,<checksum>] + # DA:463,0 + # DA:469,153 + my ($l, $h) = split ',', $v; + + # One DA: record per actual code line (vs comment or other + # non-executable line), so we count records, not line number. + $data->{_lf}++; + + # Only increment the hit count if the line was executed. + $data->{_lh}++ if $h > 0; + next; + } + + if ($k eq 'FN') { + # FN:<start line>,[<end line>,]<function name> + # FN:283,zap_lookup_by_dnode + + # One FN record per function + $data->{_fnf}++; + next; + } + if ($k eq 'FNDA') { + # FNDA:<execution count>,<function name> + # FNDA:0,zap_lookup + # FNDA:78,zap_lookup_by_dnode + + # Only count hit if at least one execution. + my ($c) = split ',', $v; + $data->{_fnh}++ if 0+$c > 0; + next; + } + + if ($k eq 'BRDA') { + # BRDA:<line number>,[<exception>]<block>,<branch>,<taken> + # BRDA:365,0,0,- + # BRDA:365,0,1,- + my ($l, $b, $br, $c) = split ',', $v; + + # One BRDA: record per branch + $data->{_brf}++; + + # <taken> is number of times branch arm was taken, or '-' if + # never considered (eg surrounding block was never entered); + # they're both 0 for our purposes. + $c = 0 if $c eq '-'; + + # Only count hit if at least one execution. + $data->{_brh}++ if 0+$c > 0; + next; + } +} + +### +# Synthesize missing counters + +for my $file (keys %filedata) { + my $data = $filedata{$file}; + + for my $k (qw(lf lh fnf fnh brf brh)) { + # Get our own count, if one exists. + my $v = delete $data->{"_$k"} // 0; + + # If we didn't find a count in the info file, use our own. + # Note that this will also set legitimately unseen values to + # 0 (eg a source file with no branches). That's actually what + # we want. 
+ unless (exists $data->{$k}) { + $data->{$k} = $v; + bump_len($k, $v); + } + } +} + +### +# Synthesize the "rate" percentage field from the "found" and "hit" fields. + +sub rate { + my ($data, $k, $kf, $kh) = @_; + my $rate = sprintf '%.01f%%', + $data->{$kf} ? (100 * $data->{$kh} / $data->{$kf}) : 0; + $data->{$k} = $rate; + bump_len($k, $rate); +} + +for my $file (keys %filedata) { + my $data = $filedata{$file}; + rate($data, 'lr', 'lf', 'lh'); + rate($data, 'brr', 'brf', 'brh'); + rate($data, 'fnr', 'fnf', 'fnh'); +} + +### +# Set up the header "rows". + +# We reuse our data record structure a little because outputting these needs to +# consider and sometimes contribute to column width. + +# The top row spans multiple columns. The pad functions below have extra tools +# to handle the math. +my $h1data = { + path => 'Coverage'.($test_name ? ": $test_name" : ''), + l => 'By line', + br => 'By branch', + fn => 'By function', +}; +bump_len('path', $h1data->{path}); + +# The second row is the actual header for each data column, and so may push +# the column widths out if necessary. +my $h2data = { + lr => 'Rate%', lf => 'Total', lh => 'Hit', + brr => 'Rate%', brf => 'Total', brh => 'Hit', + fnr => 'Rate%', fnf => 'Total', fnh => 'Hit', +}; +bump_len($_, $h2data->{$_}) for keys %$h2data; + +### +# Table layout + +# Internal helper for padr() and padl() below. The idea is to compute the +# effective column width, and the string we want to place in it. If it would +# fit exactly, we return the string. If not, the passed-in function is called +# with the string, its length and the column width, and it will place it +# (by adding padding on either side). +# +# Most calls take a single column key, which makes it very simple - take +# the max width for that column (from %len, set by bump_len()), and the value +# of that key in this column, and that's all of it. +# +# For the top heading row (h1data above), a list of column keys can be passed +# in. 
In this case, the string will be constructed as a space-separated list +# of all the keys that have a value in the data row. The column width is the +# sum of max column widths for all columns that have a max column width, plus +# one for each space separator. This allows us to provide a separate string +# to appear in the space, with the amount of space computed from the columns +# underneath it. +# +sub _pad { + my ($fn, $data, @k) = @_; + my $str = join ' ', map { $data->{$_} // () } @k; + my $strlen = length $str; + my $colwidth = -1; + $colwidth += ($len{$_} // -1)+1 for @k; + return $strlen == $colwidth ? $str : $fn->($str, $strlen, $colwidth); +} + +# Return the value of the named fields, with space-padding added to the right. +sub padr { + _pad(sub { + my ($str, $strlen, $colwidth) = @_; + $str . (' ' x ($colwidth - $strlen)); + }, @_); +} + +# Return the value of the named fields, with space-padding added to the left. +sub padl { + _pad(sub { + my ($str, $strlen, $colwidth) = @_; + (' ' x ($colwidth - $strlen)) . $str; + }, @_); +} + +# Return the given % string, wrapped in terminal control codes that will give +# it an appropriate color from the palette. +sub colorpct { + my ($pct) = @_; + + # If colors are disabled, return the string as-is. 
+ return $pct unless $use_colors; + + my ($n) = $pct =~ m/([0-9\.]+)/; + + # scale 0-100 into palette range + my $s = int(($#palette / 100) * $n); + my $c = $palette[$s]; + + return colored([$c], $pct); +} + +my @rows; + +# Lay out the first header row +push @rows, [ + padr($h1data, 'path'), + '|', padr($h1data, 'l', 'lr', 'lf', 'lh'), + '|', padr($h1data, 'br', 'brr', 'brf', 'brh'), + '|', padr($h1data, 'fn', 'fnr', 'fnf', 'fnh'), +]; + +# Lay out the second header row +push @rows, [ + padr($h2data, 'path'), + '|', padr($h2data, 'lr'), padl($h2data, 'lf'), padl($h2data, 'lh'), + '|', padr($h2data, 'brr'), padl($h2data, 'brf'), padl($h2data, 'brh'), + '|', padr($h2data, 'fnr'), padl($h2data, 'fnf'), padl($h2data, 'fnh'), +]; + +# Lay out the data rows, padding and colorising as appropriate. +for my $file (sort keys %filedata) { + my $data = $filedata{$file}; + + push @rows, [ + padr($data, 'path'), + '|', colorpct(padl($data, 'lr')), + padl($data, 'lf'), padl($data, 'lh'), + '|', colorpct(padl($data, 'brr')), + padl($data, 'brf'), padl($data, 'brh'), + '|', colorpct(padl($data, 'fnr')), + padl($data, 'fnf'), padl($data, 'fnh'), + ]; +} + +# And print them all out! 
+say "@$_" for @rows; diff --git a/sys/contrib/openzfs/tests/Makefile.am b/sys/contrib/openzfs/tests/Makefile.am index b007a3d7e5f..2002ced658c 100644 --- a/sys/contrib/openzfs/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/Makefile.am @@ -1,4 +1,5 @@ # SPDX-License-Identifier: CDDL-1.0 +include $(srcdir)/%D%/unit/Makefile.am include $(srcdir)/%D%/zfs-tests/Makefile.am diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index 4c7e4e85ec0..0dda8fdfa36 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -37,7 +37,8 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', - 'alloc_class_013_pos', 'alloc_class_016_pos'] + 'alloc_class_013_pos', 'alloc_class_014_pos', 'alloc_class_015_neg', + 'alloc_class_016_pos'] tags = ['functional', 'alloc_class'] [tests/functional/append] @@ -172,9 +173,10 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_encrypted', 'zdb_encrypted_raw', - 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', - 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', - 'zdb_backup', 'zdb_tunables'] + 'zdb_file_layout_001', 'zdb_file_layout_002', 'zdb_file_layout_003', + 'zdb_file_layout_neg', 'zdb_label_checksum', 'zdb_object_range_neg', + 'zdb_object_range_pos', 'zdb_objset_id', 'zdb_decompress_zstd', + 'zdb_recover', 'zdb_recover_2', 'zdb_backup', 'zdb_tunables'] pre = post = tags = ['functional', 'cli_root', 'zdb'] @@ -269,8 +271,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 
'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', - 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', - 'zfs_mount_test_race', 'zfs_mount_recursive'] + 'zfs_mount_remount', 'zfs_mount_ro_rw', 'zfs_mount_all_fail', + 'zfs_mount_all_mountpoints', 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] @@ -571,8 +573,8 @@ tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', - 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos', - 'user_property_001_pos', 'user_property_002_neg', + 'zpool_set_ashift', 'zpool_set_features', 'zpool_set_inherit', + 'vdev_set_001_pos', 'user_property_001_pos', 'user_property_002_neg', 'zpool_set_clear_userprop','vdev_set_scheduler'] tags = ['functional', 'cli_root', 'zpool_set'] @@ -715,10 +717,11 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing', - 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', - 'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_prune_leak', - 'dedup_zap_shrink'] +tests = ['dedup_bclone', 'dedup_bclone_pruned', 'dedup_fdt_create', + 'dedup_fdt_import', + 'dedup_fdt_pacing', 'dedup_legacy_create', 'dedup_legacy_import', + 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota', + 'dedup_prune', 'dedup_prune_leak', 'dedup_zap_shrink'] pre = post = tags = ['functional', 'dedup'] @@ -1023,6 +1026,15 @@ tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] tags = ['functional', 'scrub_mirror'] +[tests/functional/send_xdr_encoding] +tests = ['xdr_bookmark_raw', 
'xdr_bookmark_raw_with_write', + 'xdr_incr_from_bookmark', 'xdr_incr_from_redacted', 'xdr_raw', + 'xdr_redacted_full', 'xdr_redacted_received', + 'xdr_redacted_received_raw', 'xdr_replication', 'xdr_resume', + 'xdr_resume_bookmark_raw', 'xdr_resume_bookmark_raw_with_write', + 'xdr_resume_raw', 'xdr_resume_redacted'] +tags = ['functional', 'send_xdr_encoding'] + [tests/functional/slog] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', @@ -1099,7 +1111,7 @@ tags = ['functional', 'vdev_disk'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', - 'vdev_zaps_007_pos'] + 'vdev_zaps_007_pos', 'vdev_zaps_008_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/write_dirs] diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run index 11bda60a9ca..009d984f2b9 100644 --- a/sys/contrib/openzfs/tests/runfiles/linux.run +++ b/sys/contrib/openzfs/tests/runfiles/linux.run @@ -118,7 +118,8 @@ tags = ['functional', 'fallocate'] tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_double', 'auto_spare_multiple', - 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', + 'auto_spare_ashift', 'auto_spare_rotational', 'auto_spare_shared', + 'decrypt_fault', 'decompress_fault', 'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors', 'suspend_resume_single', 'suspend_draid_fgroups', 'zpool_status_-s'] diff --git a/sys/contrib/openzfs/tests/runfiles/sanity.run b/sys/contrib/openzfs/tests/runfiles/sanity.run index 936f2bcc32b..788c9b39531 100644 --- a/sys/contrib/openzfs/tests/runfiles/sanity.run +++ b/sys/contrib/openzfs/tests/runfiles/sanity.run @@ -156,7 +156,7 @@ tests = ['zfs_mount_001_pos', 
'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', - 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', + 'zfs_mount_ro_rw', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] @@ -353,12 +353,11 @@ tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', - 'zpool_set_ashift', 'zpool_set_features'] + 'zpool_set_ashift', 'zpool_set_features', 'zpool_set_inherit'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] -tests = ['zpool_split_cliargs', 'zpool_split_devices', - 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_indirect'] +tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_indirect'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] @@ -439,12 +438,6 @@ tags = ['functional', 'features', 'large_dnode'] tests = ['gang_blocks_001_pos'] tags = ['functional', 'gang_blocks'] -[tests/functional/grow] -pre = -post = -tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] -tags = ['functional', 'grow'] - [tests/functional/history] tests = ['history_004_pos', 'history_005_neg', 'history_007_pos', 'history_009_pos'] @@ -502,12 +495,6 @@ tags = ['functional', 'nestedfs'] tests = ['nopwrite_sync', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] -[tests/functional/pool_checkpoint] -tests = ['checkpoint_conf_change', 'checkpoint_discard_many', - 'checkpoint_removal', 'checkpoint_sm_scale', 'checkpoint_twice'] -tags = ['functional', 'pool_checkpoint'] -timeout = 1800 - [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] @@ -557,13 
+544,11 @@ tags = ['functional', 'reservation'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', - 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', - 'rsend_006_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', - 'rsend_014_pos', 'rsend_016_neg', 'rsend-exclude_001_pos', - 'rsend-exclude_002_pos', 'send-c_verify_contents', - 'send-c_volume', 'send-c_zstreamdump', 'send-c_recv_dedup', - 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', - 'send_encrypted_freeobjects', + 'rsend_002_pos', 'rsend_003_pos', 'rsend_009_pos', 'rsend_010_pos', + 'rsend_011_pos', 'rsend_016_neg', 'rsend-exclude_001_pos', + 'rsend-exclude_002_pos', 'send-c_volume', 'send-c_zstreamdump', + 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', + 'send_encrypted_props', 'send_encrypted_freeobjects', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_holds', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid'] @@ -644,9 +629,3 @@ tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/zpool_influxdb] tests = ['zpool_influxdb'] tags = ['functional', 'zpool_influxdb'] - -[tests/functional/pyzfs] -tests = ['pyzfs_unittest'] -pre = -post = -tags = ['functional', 'pyzfs'] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index 29d2760ccb8..a80112d914e 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -253,8 +253,9 @@ maybe = { 'renameat2/setup': ['SKIP', renameat2_reason], 'reservation/reservation_008_pos': ['FAIL', 7741], 'reservation/reservation_018_pos': ['FAIL', 5642], + 'send_xdr_encoding/xdr_bookmark_raw_with_write': ['FAIL', 18491], + 'send_xdr_encoding/xdr_resume_bookmark_raw_with_write': ['FAIL', 18491], 'snapshot/clone_001_pos': ['FAIL', known_reason], - 
'snapshot/snapshot_006_pos': ['FAIL', known_reason], 'snapshot/snapshot_009_pos': ['FAIL', 7961], 'snapshot/snapshot_010_pos': ['FAIL', 7961], 'snapused/snapused_004_pos': ['FAIL', 5513], @@ -277,7 +278,6 @@ if sys.platform.startswith('freebsd'): 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], 'resilver/resilver_restart_001': ['FAIL', known_reason], - 'snapshot/snapshot_002_pos': ['FAIL', 14831], 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', 16668], 'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason], 'bclone/bclone_crossfs_corner_cases_limited': diff --git a/sys/contrib/openzfs/tests/unit/.gitignore b/sys/contrib/openzfs/tests/unit/.gitignore new file mode 100644 index 00000000000..12a60a65666 --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/.gitignore @@ -0,0 +1,4 @@ +/test_*.info +/test_*_coverage + +/test_zap diff --git a/sys/contrib/openzfs/tests/unit/Makefile.am b/sys/contrib/openzfs/tests/unit/Makefile.am new file mode 100644 index 00000000000..80fe7311c46 --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/Makefile.am @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: CDDL-1.0 + +# libunit.la includes munit and any additional tools that apply to all tests +libunit_la_CFLAGS = $(AM_CFLAGS) + +noinst_LTLIBRARIES += libunit.la +libunit_la_SOURCES = \ + %D%/mock_dmu.c \ + %D%/mock_dmu.h \ + %D%/munit.c \ + %D%/munit.h \ + %D%/unit.c \ + %D%/unit.h + + +# all test binaries +UNIT_TESTS = \ + %D%/test_zap +noinst_PROGRAMS = $(UNIT_TESTS) + + +%C%_test_zap_CFLAGS = $(AM_CFLAGS) + +nodist_%C%_test_zap_SOURCES = \ + module/zfs/zap.c \ + module/zfs/zap_fat.c \ + module/zfs/zap_impl.c \ + module/zfs/zap_micro.c \ + module/zfs/zap_leaf.c \ + module/zfs/u8_textprep.c + +%C%_test_zap_SOURCES = \ + %D%/test_zap.c + +%C%_test_zap_LDADD = \ + libspl.la \ + libbtree.la \ + libunit.la + + +# test run and coverage targets below +PHONY += unit unit-coverage unit-coverage-html + +_unit_run_%: %D%/% + @echo 
" UNITTEST $<" ; $< $(TOPT) + +# note: any changes in switches to lcov or genhtml must be carefully checked +# on 1.x and 2.x; the current option set is carefully chosen to allow +# both to work sensibly + +# .info is marked PRECIOUS, because its usually only created as an intermediate +# from one of the unit phony targets, but once it exists there's no point +# remaking it until and unless the test binary itself changes +.PRECIOUS: %D%/%.info +%D%/%.info: %D%/% + @-${RM} $@ + @${LCOV} --quiet --quiet --zerocounters --directory $(top_srcdir) + @echo " UNITTEST $<" ; $< $(TOPT) + @${LCOV} --quiet --quiet --capture \ + --test-name $(notdir $<) \ + --directory $(top_srcdir) \ + --output-file $@ \ + --rc lcov_branch_coverage=1 \ + --rc geninfo_unexecuted_blocks=1 \ + $(addprefix --include $(abs_top_builddir)/, $(call \ + $(join $(join nodist_%C%_, $(notdir $<)), _SOURCES))) \ + 2>/dev/null + +_unit_coverage_%: %D%/%.info + @scripts/coverage_report.pl $< + +_unit_coverage_html_%: %D%/%.info + @-${RM} -r $(subst .info,_coverage, $<) + @${GENHTML} --quiet -quiet \ + --rc lcov_branch_coverage=1 \ + --rc check_data_consistency=0 \ + --output-directory $(subst .info,_coverage, $<) \ + $< \ + 2>/dev/null + @echo "coverage results:" \ + "file://$(realpath %D%)/$(subst .info,_coverage,$(notdir $<))/index.html" + +CLEAN_LOCAL += unit-clean-local +unit-clean-local: + -${RM} -r %D%/*.info %D%/*_coverage/ + +_UNIT_ALL_TARGETS = $(notdir $(UNIT_TESTS)) +_UNIT_FIND_TARGET = \ + $(foreach cmd, $(UNIT_TESTS), \ + $(if $(filter $(join test_, $(1)), $(notdir $(cmd))), \ + $(notdir $(cmd)))) + +_UNIT_TARGETS = $(if $(T), \ + $(call _UNIT_FIND_TARGET, $(T)), $(call _UNIT_ALL_TARGETS)) + +unit: $(addprefix _unit_run_, $(_UNIT_TARGETS)) + @$(if $^, true, echo "ERROR: couldn't find unit test: $(T)" && false) + +if CODE_COVERAGE_ENABLED +unit-coverage: $(addprefix _unit_coverage_, $(_UNIT_TARGETS)) + @$(if $^, true, echo "ERROR: couldn't find unit test: $(T)" && false) +unit-coverage-html: 
$(addprefix _unit_coverage_html_, $(_UNIT_TARGETS)) + @$(if $^, true, echo "ERROR: couldn't find unit test: $(T)" && false) +else +unit-coverage: + @echo "unit test coverage not enabled." + @echo "re-run configure with --enable-code-coverage" + @false +unit-coverage-html: unit-coverage +endif diff --git a/sys/contrib/openzfs/tests/unit/README.md b/sys/contrib/openzfs/tests/unit/README.md new file mode 100644 index 00000000000..6a4ee095af2 --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/README.md @@ -0,0 +1,217 @@ +# Unit tests + +> [!NOTE] +> +> This document is a draft. It will be updated as we gain experience writing +> and running unit tests. + +This directory contains a unit testing framework for OpenZFS, and a collection +of unit tests. + +## Building and running + +The unit tests are built by default as part of the regular userspace build, so +you probably don’t have to do anything else. + +The easiest way to run the tests is to run `make unit`, which will run all the +available tests. + +``` +$ make unit + UNITTEST tests/unit/test_zap +Running test suite with seed 0x9d36890b... +zap.mock_microzap_sanity [ OK ] [ 0.00001088 / 0.00000939 CPU ] +zap.mock_fatzap_sanity [ OK ] [ 0.00004281 / 0.00004257 CPU ] +zap.zap_basic + type=micro [ OK ] [ 0.00001899 / 0.00001893 CPU ] + type=fat [ OK ] [ 0.00004174 / 0.00004135 CPU ] +4 of 4 (100%) tests successful, 0 (0%) test skipped. +``` + +Running a single test binary is possible with the `T=` param to `make unit`. + +``` +$ make unit T=zap + UNITTEST tests/unit/test_zap + ... +``` + +The test binaries are just normal programs in `./tests/unit`, and can be run +directly. This is useful for debugging with `gdb`. + +``` +$ ./tests/unit/test_zap +Running test suite with seed 0x18e131ac... +... +``` + +The test framework provides various options for controlling how the tests are +run. Add the `--help` switch for more info. If using the make rule, options can +be passed via the `TOPT=` param. 
+ +### Building just for tests + +Recommended “minimum” build for just the unit tests, with additional debug to +assist with understanding issues. + +``` +./configure \ + --with-config=user \ + --enable-debug --enable-debuginfo \ + --disable-sysvinit --disable-systemd --disable-pam --disable-pyzfs +make -j$(nproc) +``` + +TODO: add `--with-config=unit` that disables _everything_ not needed for the +tests + +### Generating a coverage report + +If `configure` was run with `--enable-code-coverage`, then two additional build +targets are available that will run the requested tests and produce a report. + +The `unit-coverage` target runs `scripts/coverage_report.pl` to produce a +coverage summary directly in text immediately after the test output, and is +good for inclusion in log files and other build system output. + +``` +$ make unit-coverage T=zap + UNITTEST tests/unit/test_zap +Running test suite with seed 0xf51efca9... +zap.mock_microzap_sanity [ OK ] [ 0.00000941 / 0.00000834 CPU ] +zap.mock_fatzap_sanity [ OK ] [ 0.00005782 / 0.00005766 CPU ] +... +zap.cursor_release_one + type=micro [ OK ] [ 0.00001705 / 0.00001681 CPU ] + type=fat [ OK ] [ 0.00004748 / 0.00004738 CPU ] +30 of 30 (100%) tests successful, 0 (0%) test skipped. +Coverage: test_zap | By line | By branch | By function + | Rate% Total Hit | Rate% Total Hit | Rate% Total Hit +module/zfs/u8_textprep.c | 0.0% 802 0 | 0.0% 510 0 | 0.0% 12 0 +module/zfs/zap.c | 33.9% 610 207 | 31.1% 238 74 | 23.0% 74 17 +module/zfs/zap_fat.c | 47.1% 665 313 | 29.8% 446 133 | 62.2% 37 23 +module/zfs/zap_impl.c | 57.8% 232 134 | 39.7% 146 58 | 72.0% 25 18 +module/zfs/zap_leaf.c | 60.9% 466 284 | 41.2% 216 89 | 78.3% 23 18 +module/zfs/zap_micro.c | 68.9% 238 164 | 41.5% 142 59 | 92.9% 14 13 +``` + +The `unit-coverage-html` will use `lcov` and `genhtml` to generate an +interactive HTML report that also can show the specific source lines that are +covered. 
+ +``` +$ make unit-coverage-html T=zap + UNITTEST tests/unit/test_zap +Running test suite with seed 0x485bf2e2... +zap.mock_microzap_sanity [ OK ] [ 0.00000935 / 0.00000794 CPU ] +zap.mock_fatzap_sanity [ OK ] [ 0.00006050 / 0.00006025 CPU ] +... +zap.cursor_release_one + type=micro [ OK ] [ 0.00001785 / 0.00001767 CPU ] + type=fat [ OK ] [ 0.00005262 / 0.00005250 CPU ] +30 of 30 (100%) tests successful, 0 (0%) test skipped. +coverage results: +file:///home/robn/code/zfs-unit/tests/unit/tests/unit/test_zap_coverage/index.ht +ml +``` + +Currently the coverage data will only be regenerated when the test binary +itself changes. To force it, use `make unit-clean-local` to remove the coverage +data. + +## Guidance for test writers + +### Top five + +* Only bring in the source files under test. +* Use mocks to create the test scenario, then interrogate them to understand +the result. +* Prefer more smaller tests over fewer bigger ones. +* Use coverage reports to guide test development. +* Do the simplest possible thing. + +### Test structure + +Tests should be as simple and as readable as possible. When a test fails, we +want to avoid the possibility that it could be the test itself at fault rather +than the system under test. + +* Aim for one source file per subsystem or source concept (eg ZAP). +* Aim for one test function per API call or logical behaviour + * Each “version” or “mode” of an API call or behaviour is a separate test + * Don’t test more than one thing in the same test; a test shouldn’t rely on + state or results from an earlier test +* Use test parameters for “class“ or ”vtable” -type APIs, where each + implementation should respond to API calls the same way + +### Build system + +The build setup `tests/unit/Makefile.am` is very similar to the other +userspace, however it has a couple of differences to make the run and coverage +targets work more smoothly. + +* Name the test program `test_foo`. 
Almost always, you will have one source + file with the actual tests in it, called `test_foo.c`. +* Add the program to `UNIT_TESTS`. `noinst_PROGRAMS` will be populated from it, + but this gives a specific name the run and coverage targets can use to + resolve the `T=` parameter to a specific test. +* List the source files under test in `nodist_%C%_test_foo_SOURCES`, and the + source files for the test itself in `%C%_test_foo_SOURCES`. This is + important, as the coverage targets use `nodist_%C%_ ... _SOURCES` as the list + of objects to include in the coverage output. + +### Mocks + +A “mock” struct is a fake version of some data structure that the subsystem +under test will accept and use as though it was a real one. + +* Make mock structs opaque. All uses from the test suite should be through + specific named accessor functions. +* Name a mock struct for the struct it is mimicking, prefixed with `mock_`. eg + `mock_dnode_t` is the mock for `dnode_t`. +* Access functions should be named for the struct, eg the function to create a + `mock_dnode_t` is `mock_dnode_t *mock_dnode_create(...)`. +* `mock_*` functions should always use the mock type name in its signature, + never the original. +* The mock object should always be directly castable to its real type and + vice-versa, ie a `mock_dnode_t *` is always usable wherever a `dnode_t *` + is (within the domain of the subsystem under test). + +This guidance pushes the programmer towards being explicit at the possible +expense of concision. This is in service of keeping the tests reliable; in +particular, if mocks require explicit casting to use, then there’s far less +chance of either a mock or a real object being used incorrectly in the test, +which can be confusing. + +### Unit testing framework + +[µnit](https://nemequ.github.io/munit/) (aka munit) is the unit test framework. 
+It is a relatively niche choice, and arguably abandoned by upstream, but is +well constructed with a thoughtful feature set and some useful properties: + +* Just two source files we can easily carry in the repo. +* Portable, including to Windows. +* Each test is run in a forked process, so a test failure will not corrupt the + rest of the test suite run. +* Parameterised tests. +* A large suite of assertions and other useful functions that make it easy to + integrate with. + +All OpenZFS unit tests are ultimately targeting munit, so it's expected that +they will use various features as needed. However, we also supply our own +facilities to extend those in useful ways. + +#### Local extensions + +`unit.h` provides a handful of macros. The majority of these are aliases for +the much longer munit names for the same function, eg `unit_true(n)` is an alias +for `munit_assert_true(n)`, `unit_eq(a,b)` is an alias for +`munit_assert_uint64(a, ==, b)`, and so on. These are there so that the +assertions do not dominate the test visually, as we want it to be easier to +focus on the details. + +Similarly, the `UNIT_TEST` and `UNIT_PARAM` macros exist to help with test +definition, as the casts are a little complicated. + +The goal is to keep this set relatively small, but all of munit is there for +use, so do extend it if necessary. diff --git a/sys/contrib/openzfs/tests/unit/mock_dmu.c b/sys/contrib/openzfs/tests/unit/mock_dmu.c new file mode 100644 index 00000000000..ae035498da6 --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/mock_dmu.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2026, TrueNAS. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "mock_dmu.h" +#include "unit.h" + +/* + * A mock dbuf. A real dmu_buf_t (first for casting) plus the attached user + * data pointer. Block data is stored in a separate allocation so that the + * struct address remains stable across block resizes. + */ +struct mock_dbuf { + dmu_buf_t mdb_db; + dmu_buf_user_t *mdb_user; + mock_dnode_t *mdb_owner; + void *mdb_data; +}; +typedef struct mock_dbuf mock_dbuf_t; + +/* + * A mock dnode. a real dnode_t (must be first for casting) with dn_type + * and dn_object set, plus a flat array of mock_dbuf_t indexed by block id. + */ +struct mock_dnode { + dnode_t mdn_dn; + uint64_t mdn_refcount; + size_t mdn_blksize; + size_t mdn_nblocks; + mock_dbuf_t **mdn_blocks; +}; + +/* + * A mock transaction. We only allocate and zero it, nothing currently uses + * any of its internals. + */ +struct mock_dmu_tx { + dmu_tx_t mtx_tx; +}; + +/* Mock dnode */ + +static mock_dbuf_t * +mock_dnode_block_alloc(mock_dnode_t *mdn, uint64_t blkid) +{ + mock_dbuf_t *mdb = kmem_zalloc(sizeof (mock_dbuf_t), KM_SLEEP); + mdb->mdb_data = kmem_zalloc(mdn->mdn_blksize, KM_SLEEP); + + mdb->mdb_db.db_object = mdn->mdn_dn.dn_object; + mdb->mdb_db.db_offset = blkid * mdn->mdn_blksize; + mdb->mdb_db.db_size = mdn->mdn_blksize; + mdb->mdb_db.db_data = mdb->mdb_data; + mdb->mdb_owner = mdn; + + return (mdb); +} + +/* Grow the dbuf array if needed, then return (or create) the dbuf for blkid. 
*/ +static mock_dbuf_t * +mock_dnode_block_get(mock_dnode_t *mdn, uint64_t blkid) +{ + if (blkid >= mdn->mdn_nblocks) { + size_t new_n = blkid + 1; + mock_dbuf_t **new_blocks = + kmem_zalloc(new_n * sizeof (mock_dbuf_t *), KM_SLEEP); + if (mdn->mdn_blocks != NULL) { + memcpy(new_blocks, mdn->mdn_blocks, + mdn->mdn_nblocks * sizeof (mock_dbuf_t *)); + kmem_free(mdn->mdn_blocks, + mdn->mdn_nblocks * sizeof (mock_dbuf_t *)); + } + mdn->mdn_blocks = new_blocks; + mdn->mdn_nblocks = new_n; + } + + mock_dbuf_t *mdb = mdn->mdn_blocks[blkid]; + if (mdb == NULL) { + mdb = mock_dnode_block_alloc(mdn, blkid); + mdn->mdn_blocks[blkid] = mdb; + } + return (mdb); +} + +mock_dnode_t * +mock_dnode_create(size_t blksize, dmu_object_type_t type) +{ + ASSERT(IS_P2ALIGNED(blksize, 512)); + + mock_dnode_t *mdn = kmem_zalloc(sizeof (mock_dnode_t), KM_SLEEP); + mdn->mdn_refcount = 1; + mdn->mdn_dn.dn_type = type; + mdn->mdn_dn.dn_object = 1; /* arbitrary non-zero object number */ + mdn->mdn_blksize = blksize; + + return (mdn); +} + +void +mock_dnode_destroy(mock_dnode_t *mdn) +{ + for (size_t i = 0; i < mdn->mdn_nblocks; i++) { + mock_dbuf_t *mdb = mdn->mdn_blocks[i]; + if (mdb == NULL) + continue; + + /* + * Call the sync evict callback if one is set, mimicking the + * real DMU when a buffer's refcount drops to zero. 
+ */ + if (mdb->mdb_user != NULL && + mdb->mdb_user->dbu_evict_func_sync != NULL) + mdb->mdb_user->dbu_evict_func_sync(mdb->mdb_user); + + kmem_free(mdb->mdb_data, mdb->mdb_db.db_size); + kmem_free(mdb, sizeof (mock_dbuf_t)); + } + + kmem_free(mdn->mdn_blocks, + mdn->mdn_nblocks * sizeof (mock_dbuf_t *)); + kmem_free(mdn, sizeof (mock_dnode_t)); +} + +size_t +mock_dnode_block_count(mock_dnode_t *mdn) +{ + return (mdn->mdn_nblocks); +} + +const void * +mock_dnode_block_data(mock_dnode_t *mdn, uint64_t blkid) +{ + if (blkid >= mdn->mdn_nblocks) + return (NULL); + return (mdn->mdn_blocks[blkid]->mdb_db.db_data); +} + +uint64_t +mock_dnode_refcount(mock_dnode_t *mdn) +{ + return (mdn->mdn_refcount); +} + +/* Mock transaction */ + +mock_dmu_tx_t * +mock_tx_create(void) +{ + return (kmem_zalloc(sizeof (mock_dmu_tx_t), KM_SLEEP)); +} + +void +mock_tx_destroy(mock_dmu_tx_t *tx) +{ + kmem_free(tx, sizeof (mock_dmu_tx_t)); +} + +/* DMU stubs, either no-op or light access to mock dnode internals. */ + +int +dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, + dmu_buf_t **dbp, dmu_flags_t flags) +{ + (void) tag; (void) flags; + + mock_dnode_t *mdn = (mock_dnode_t *)dn; + uint64_t blkid = offset / mdn->mdn_blksize; + mock_dbuf_t *mdb = mock_dnode_block_get(mdn, blkid); + + *dbp = &mdb->mdb_db; + return (0); +} + +void +dmu_buf_rele(dmu_buf_t *db, const void *tag) +{ + (void) db; (void) tag; +} + +void * +dmu_buf_get_user(dmu_buf_t *db) +{ + mock_dbuf_t *mdb = (mock_dbuf_t *)db; + return (mdb->mdb_user); +} + +void * +dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *new_user) +{ + mock_dbuf_t *mdb = (mock_dbuf_t *)db; + if (mdb->mdb_user != NULL) + return (mdb->mdb_user); /* existing user wins */ + mdb->mdb_user = new_user; + return (NULL); /* new_user wins */ +} + +void +dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx) +{ + (void) db; (void) tx; +} + +objset_t * +dmu_buf_get_objset(dmu_buf_t *db) +{ + mock_dbuf_t *mdb = (mock_dbuf_t *)db; + + /* + * We return 
the mock_dnode_t pointer cast to objset_t so that + * dmu_object_set_blocksize() below can recover the dnode without + * needing a separate objset structure. + */ + return ((objset_t *)mdb->mdb_owner); +} + +int +dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, + int ibs, dmu_tx_t *tx) +{ + (void) object; (void) ibs; (void) tx; + + /* os is a mock_dnode_t (see dmu_buf_get_objset() above). */ + mock_dnode_t *mdn = (mock_dnode_t *)os; + + /* + * Resize block 0's data buffer in place so the struct address stays + * stable. + */ + mock_dbuf_t *mdb = mdn->mdn_blocks[0]; + void *new_data = kmem_zalloc(size, KM_SLEEP); + memcpy(new_data, mdb->mdb_data, + MIN(size, (size_t)mdb->mdb_db.db_size)); + kmem_free(mdb->mdb_data, mdb->mdb_db.db_size); + + mdb->mdb_data = new_data; + mdb->mdb_db.db_size = size; + mdb->mdb_db.db_data = new_data; + mdn->mdn_blksize = size; + + return (0); +} + +boolean_t +dnode_add_ref(dnode_t *dn, const void *tag) +{ + (void) tag; + mock_dnode_t *mdn = (mock_dnode_t *)dn; + if (mdn->mdn_refcount == 0) + return (B_FALSE); + mdn->mdn_refcount++; + return (B_TRUE); +} + +void +dnode_rele(dnode_t *dn, const void *tag) +{ + (void) tag; + mock_dnode_t *mdn = (mock_dnode_t *)dn; + unit_gt(mdn->mdn_refcount, 0); + mdn->mdn_refcount--; +} + +/* + * Misc other stubs. Not strictly DMU mocks, and might move elsewhere later, + * but for now this is all we need for our limited test set. 
+ */ + +spa_t * +dmu_objset_spa(objset_t *os) +{ + (void) os; + return (NULL); +} + +int +dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + (void) os; (void) object; (void) offset; (void) size; (void) tx; + return (0); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + (void) dn; (void) level; (void) offset; (void) len; (void) pri; +} + +dsl_dataset_t * +dmu_objset_ds(objset_t *os) +{ + (void) os; + return (NULL); +} + +boolean_t +dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f) +{ + (void) ds; (void) f; + return (B_FALSE); +} + +void +dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + (void) ds; (void) tx; +} + +boolean_t +spa_feature_is_enabled(spa_t *spa, spa_feature_t f) +{ + (void) spa; (void) f; + return (B_FALSE); +} + +int +spa_maxblocksize(spa_t *spa) +{ + (void) spa; + return (SPA_OLD_MAXBLOCKSIZE); +} + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; + +void +byteswap_uint64_array(void *buf, size_t size) +{ + (void) buf; (void) size; +} + +/* + * Various objset+object calls; returning error, as they need to use + * _by_dnode() variants to get the mock. 
+ */ +int +dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) +{ + (void) os; (void) object; (void) tag; (void) dnp; + return (EIO); +} + +int +dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + (void) os; (void) object; (void) tx; + return (EIO); +} + +uint64_t +dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, + int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, + int bonuslen, int dnodesize, dnode_t **allocated_dnode, + const void *tag, dmu_tx_t *tx) +{ + (void) os; (void) ot; (void) blocksize; (void) indirect_blockshift; + (void) bonustype; (void) bonuslen; (void) dnodesize; + (void) allocated_dnode; (void) tag; (void) tx; + return (EIO); +} + +int +dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, + int dnodesize, dmu_tx_t *tx) +{ + (void) os; (void) object; (void) ot; (void) blocksize; + (void) bonus_type; (void) bonus_len; (void) dnodesize; (void) tx; + return (EIO); +} + +int +dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) +{ + (void) os; (void) object; (void) doi; + return (EIO); +} + +int +dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, + uint64_t len) +{ + (void) os; (void) object; (void) offset; (void) len; + return (EIO); +} diff --git a/sys/contrib/openzfs/tests/unit/mock_dmu.h b/sys/contrib/openzfs/tests/unit/mock_dmu.h new file mode 100644 index 00000000000..2ac82c18b7a --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/mock_dmu.h @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2026, TrueNAS. + */ + +#ifndef _MOCK_DMU_H +#define _MOCK_DMU_H + +/* + * In-memory mock of the core DMU types for unit testing. + * + * Provides mock_dnode_t carrying a flat array of fixed-size blocks. + */ + +#include + +typedef struct mock_dnode mock_dnode_t; +typedef struct mock_dmu_tx mock_dmu_tx_t; + +/* Create a mock dnode with the given block size and object type. */ +mock_dnode_t *mock_dnode_create(size_t blksize, dmu_object_type_t type); + +/* Free a mock dnode and all its blocks. */ +void mock_dnode_destroy(mock_dnode_t *mdn); + +/* Returns the current number of blocks underlying this dnode. */ +size_t mock_dnode_block_count(mock_dnode_t *mdn); + +/* Returns a pointer to the data under the given block id. */ +const void *mock_dnode_block_data(mock_dnode_t *mdn, uint64_t blkid); + +/* Returns the current dnode ref (hold) count. */ +uint64_t mock_dnode_refcount(mock_dnode_t *mdn); + +/* Create/destroy a mock transaction handle. 
*/ +mock_dmu_tx_t *mock_tx_create(void); +void mock_tx_destroy(mock_dmu_tx_t *tx); + +#endif /* _MOCK_DMU_H */ diff --git a/sys/contrib/openzfs/tests/unit/munit.c b/sys/contrib/openzfs/tests/unit/munit.c new file mode 100644 index 00000000000..73d32728e8c --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/munit.c @@ -0,0 +1,2458 @@ +// SPDX-License-Identifier: MIT +/* µnit Testing Framework + * Copyright (c) 2013-2018 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*** Configuration ***/ + +/* This is just where the output from the test goes. It's really just + * meant to let you choose stdout or stderr, but if anyone really want + * to direct it to a file let me know, it would be fairly easy to + * support. */ +#if !defined(MUNIT_OUTPUT_FILE) +# define MUNIT_OUTPUT_FILE stdout +#endif + +/* This is a bit more useful; it tells µnit how to format the seconds in + * timed tests. 
If your tests run for longer you might want to reduce + * it, and if your computer is really fast and your tests are tiny you + * can increase it. */ +#if !defined(MUNIT_TEST_TIME_FORMAT) +# define MUNIT_TEST_TIME_FORMAT "0.8f" +#endif + +/* If you have long test names you might want to consider bumping + * this. The result information takes 43 characters. */ +#if !defined(MUNIT_TEST_NAME_LEN) +# define MUNIT_TEST_NAME_LEN 37 +#endif + +/* If you don't like the timing information, you can disable it by + * defining MUNIT_DISABLE_TIMING. */ +#if !defined(MUNIT_DISABLE_TIMING) +# define MUNIT_ENABLE_TIMING +#endif + +/* OpenZFS: claim no strerror_r, causing munit to use its own internal + * fallback. There are two version of strerror_r (XSI and GNU), subtly + * different, and some glibc versions have warn_unused_result set on the + * prototype. munit is not prepared for this variance, so better just to + * let it do its own thing. -- robn, 2026-05-21 */ +#if !defined(MUNIT_NO_STRERROR_R) +# define MUNIT_NO_STRERROR_R +#endif + +/*** End configuration ***/ + +#if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L) +# undef _POSIX_C_SOURCE +#endif +#if !defined(_POSIX_C_SOURCE) +# define _POSIX_C_SOURCE 200809L +#endif + +/* Solaris freaks out if you try to use a POSIX or SUS standard without + * the "right" C standard. */ +#if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif + +#if defined(__STDC_VERSION__) +# if __STDC_VERSION__ >= 201112L +# define _XOPEN_SOURCE 700 +# elif __STDC_VERSION__ >= 199901L +# define _XOPEN_SOURCE 600 +# endif +#endif + +/* Because, according to Microsoft, POSIX is deprecated. You've got + * to appreciate the chutzpah. 
*/ +#if defined(_MSC_VER) && !defined(_CRT_NONSTDC_NO_DEPRECATE) +# define _CRT_NONSTDC_NO_DEPRECATE +#endif + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) +# include +#elif defined(_WIN32) +/* https://msdn.microsoft.com/en-us/library/tf4dy80a.aspx */ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(MUNIT_NO_NL_LANGINFO) && !defined(_WIN32) +# define MUNIT_NL_LANGINFO +# include +# include +# include +#endif + +#if !defined(_WIN32) +# include +# include +# include +#else +# include +# include +# include +# if !defined(STDERR_FILENO) +# define STDERR_FILENO _fileno(stderr) +# endif +#endif + +#include "munit.h" + +#define MUNIT_STRINGIFY(x) #x +#define MUNIT_XSTRINGIFY(x) MUNIT_STRINGIFY(x) + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) +# define MUNIT_THREAD_LOCAL __thread +#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201102L)) || \ + defined(_Thread_local) +# define MUNIT_THREAD_LOCAL _Thread_local +#elif defined(_WIN32) +# define MUNIT_THREAD_LOCAL __declspec(thread) +#endif + +/* MSVC 12.0 will emit a warning at /W4 for code like 'do { ... } + * while (0)', or 'do { ... } while (1)'. I'm pretty sure nobody + * at Microsoft compiles with /W4. 
*/ +#if defined(_MSC_VER) && (_MSC_VER <= 1800) +# pragma warning(disable : 4127) +#endif + +#if defined(_WIN32) || defined(__EMSCRIPTEN__) +# define MUNIT_NO_FORK +#endif + +#if defined(__EMSCRIPTEN__) +# define MUNIT_NO_BUFFER +#endif + +/*** Logging ***/ + +static MunitLogLevel munit_log_level_visible = MUNIT_LOG_INFO; +static MunitLogLevel munit_log_level_fatal = MUNIT_LOG_ERROR; + +#if defined(MUNIT_THREAD_LOCAL) +static MUNIT_THREAD_LOCAL munit_bool munit_error_jmp_buf_valid = 0; +static MUNIT_THREAD_LOCAL jmp_buf munit_error_jmp_buf; +#endif + +/* At certain warning levels, mingw will trigger warnings about + * suggesting the format attribute, which we've explicity *not* set + * because it will then choke on our attempts to use the MS-specific + * I64 modifier for size_t (which we have to use since MSVC doesn't + * support the C99 z modifier). */ + +#if defined(__MINGW32__) || defined(__MINGW64__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +#endif + +MUNIT_PRINTF(5, 0) +static void munit_logf_exv(MunitLogLevel level, FILE *fp, const char *filename, + int line, const char *format, va_list ap) { + if (level < munit_log_level_visible) + return; + + switch (level) { + case MUNIT_LOG_DEBUG: + fputs("Debug", fp); + break; + case MUNIT_LOG_INFO: + fputs("Info", fp); + break; + case MUNIT_LOG_WARNING: + fputs("Warning", fp); + break; + case MUNIT_LOG_ERROR: + fputs("Error", fp); + break; + default: + munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Invalid log level (%d)", + level); + return; + } + + fputs(": ", fp); + if (filename != NULL) + fprintf(fp, "%s:%d: ", filename, line); + vfprintf(fp, format, ap); + fputc('\n', fp); +} + +MUNIT_PRINTF(3, 4) +static void munit_logf_internal(MunitLogLevel level, FILE *fp, + const char *format, ...) 
{ + va_list ap; + + va_start(ap, format); + munit_logf_exv(level, fp, NULL, 0, format, ap); + va_end(ap); +} + +static void munit_log_internal(MunitLogLevel level, FILE *fp, + const char *message) { + munit_logf_internal(level, fp, "%s", message); +} + +void munit_logf_ex(MunitLogLevel level, const char *filename, int line, + const char *format, ...) { + va_list ap; + + va_start(ap, format); + munit_logf_exv(level, stderr, filename, line, format, ap); + va_end(ap); + + if (level >= munit_log_level_fatal) { +#if defined(MUNIT_THREAD_LOCAL) + if (munit_error_jmp_buf_valid) + longjmp(munit_error_jmp_buf, 1); +#endif + abort(); + } +} + +void munit_errorf_ex(const char *filename, int line, const char *format, ...) { + va_list ap; + + va_start(ap, format); + munit_logf_exv(MUNIT_LOG_ERROR, stderr, filename, line, format, ap); + va_end(ap); + +#if defined(MUNIT_THREAD_LOCAL) + if (munit_error_jmp_buf_valid) + longjmp(munit_error_jmp_buf, 1); +#endif + abort(); +} + +#if defined(__MINGW32__) || defined(__MINGW64__) +# pragma GCC diagnostic pop +#endif + +#if !defined(MUNIT_STRERROR_LEN) +# define MUNIT_STRERROR_LEN 80 +#endif + +static void munit_log_errno(MunitLogLevel level, FILE *fp, const char *msg) { +#if defined(MUNIT_NO_STRERROR_R) || \ + (defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API)) + munit_logf_internal(level, fp, "%s: %s (%d)", msg, strerror(errno), errno); +#else + char munit_error_str[MUNIT_STRERROR_LEN]; + munit_error_str[0] = '\0'; + +# if !defined(_WIN32) + strerror_r(errno, munit_error_str, MUNIT_STRERROR_LEN); +# else + strerror_s(munit_error_str, MUNIT_STRERROR_LEN, errno); +# endif + + munit_logf_internal(level, fp, "%s: %s (%d)", msg, munit_error_str, errno); +#endif +} + +/*** Memory allocation ***/ + +void *munit_malloc_ex(const char *filename, int line, size_t size) { + void *ptr; + + if (size == 0) + return NULL; + + ptr = calloc(1, size); + if (MUNIT_UNLIKELY(ptr == NULL)) { + munit_logf_ex(MUNIT_LOG_ERROR, filename, line, + "Failed to 
allocate %" MUNIT_SIZE_MODIFIER "u bytes.", size); + } + + return ptr; +} + +/*** Timer code ***/ + +#if defined(MUNIT_ENABLE_TIMING) + +# define psnip_uint64_t munit_uint64_t +# define psnip_uint32_t munit_uint32_t + +/* Code copied from portable-snippets + * . If you need to + * change something, please do it there so we can keep the code in + * sync. */ + +/* Clocks (v1) + * Portable Snippets - https://gitub.com/nemequ/portable-snippets + * Created by Evan Nemerson + * + * To the extent possible under law, the authors have waived all + * copyright and related or neighboring rights to this code. For + * details, see the Creative Commons Zero 1.0 Universal license at + * https://creativecommons.org/publicdomain/zero/1.0/ + */ + +# if !defined(PSNIP_CLOCK_H) +# define PSNIP_CLOCK_H + +# if !defined(psnip_uint64_t) +# include "../exact-int/exact-int.h" +# endif + +# if !defined(PSNIP_CLOCK_STATIC_INLINE) +# if defined(__GNUC__) +# define PSNIP_CLOCK__COMPILER_ATTRIBUTES __attribute__((__unused__)) +# else +# define PSNIP_CLOCK__COMPILER_ATTRIBUTES +# endif + +# define PSNIP_CLOCK__FUNCTION PSNIP_CLOCK__COMPILER_ATTRIBUTES static +# endif + +enum PsnipClockType { + /* This clock provides the current time, in units since 1970-01-01 + * 00:00:00 UTC not including leap seconds. In other words, UNIX + * time. Keep in mind that this clock doesn't account for leap + * seconds, and can go backwards (think NTP adjustments). */ + PSNIP_CLOCK_TYPE_WALL = 1, + /* The CPU time is a clock which increases only when the current + * process is active (i.e., it doesn't increment while blocking on + * I/O). */ + PSNIP_CLOCK_TYPE_CPU = 2, + /* Monotonic time is always running (unlike CPU time), but it only + ever moves forward unless you reboot the system. Things like NTP + adjustments have no effect on this clock. 
*/ + PSNIP_CLOCK_TYPE_MONOTONIC = 3 +}; + +struct PsnipClockTimespec { + psnip_uint64_t seconds; + psnip_uint64_t nanoseconds; +}; + +/* Methods we support: */ + +# define PSNIP_CLOCK_METHOD_CLOCK_GETTIME 1 +# define PSNIP_CLOCK_METHOD_TIME 2 +# define PSNIP_CLOCK_METHOD_GETTIMEOFDAY 3 +# define PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER 4 +# define PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME 5 +# define PSNIP_CLOCK_METHOD_CLOCK 6 +# define PSNIP_CLOCK_METHOD_GETPROCESSTIMES 7 +# define PSNIP_CLOCK_METHOD_GETRUSAGE 8 +# define PSNIP_CLOCK_METHOD_GETSYSTEMTIMEPRECISEASFILETIME 9 +# define PSNIP_CLOCK_METHOD_GETTICKCOUNT64 10 + +# include + +# if defined(HEDLEY_UNREACHABLE) +# define PSNIP_CLOCK_UNREACHABLE() HEDLEY_UNREACHABLE() +# else +# define PSNIP_CLOCK_UNREACHABLE() assert(0) +# endif + +/* Choose an implementation */ + +/* #undef PSNIP_CLOCK_WALL_METHOD */ +/* #undef PSNIP_CLOCK_CPU_METHOD */ +/* #undef PSNIP_CLOCK_MONOTONIC_METHOD */ + +/* We want to be able to detect the libc implementation, so we include + ( isn't available everywhere). */ + +# if defined(__unix__) || defined(__unix) || defined(__linux__) +# include +# include +# endif + +# if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) +/* These are known to work without librt. If you know of others + * please let us know so we can add them. 
*/ +# if (defined(__GLIBC__) && \ + (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17))) || \ + (defined(__FreeBSD__)) +# define PSNIP_CLOCK_HAVE_CLOCK_GETTIME +# elif !defined(PSNIP_CLOCK_NO_LIBRT) +# define PSNIP_CLOCK_HAVE_CLOCK_GETTIME +# endif +# endif + +# if defined(_WIN32) +# if !defined(PSNIP_CLOCK_CPU_METHOD) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_GETPROCESSTIMES +# endif +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# define PSNIP_CLOCK_MONOTONIC_METHOD \ + PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER +# endif +# endif + +# if defined(__MACH__) && !defined(__gnu_hurd__) +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# define PSNIP_CLOCK_MONOTONIC_METHOD \ + PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME +# endif +# endif + +# if defined(PSNIP_CLOCK_HAVE_CLOCK_GETTIME) +# include +# if !defined(PSNIP_CLOCK_WALL_METHOD) +# if defined(CLOCK_REALTIME_PRECISE) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME_PRECISE +# elif !defined(__sun) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME +# endif +# endif +# if !defined(PSNIP_CLOCK_CPU_METHOD) +# if defined(_POSIX_CPUTIME) || defined(CLOCK_PROCESS_CPUTIME_ID) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_PROCESS_CPUTIME_ID +# elif defined(CLOCK_VIRTUAL) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_VIRTUAL +# endif +# endif +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# if defined(CLOCK_MONOTONIC_RAW) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC +# elif defined(CLOCK_MONOTONIC_PRECISE) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC_PRECISE +# elif 
defined(_POSIX_MONOTONIC_CLOCK) || defined(CLOCK_MONOTONIC) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC +# endif +# endif +# endif + +# if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L) +# if !defined(PSNIP_CLOCK_WALL_METHOD) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_GETTIMEOFDAY +# endif +# endif + +# if !defined(PSNIP_CLOCK_WALL_METHOD) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_TIME +# endif + +# if !defined(PSNIP_CLOCK_CPU_METHOD) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK +# endif + +/* Primarily here for testing. */ +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + defined(PSNIP_CLOCK_REQUIRE_MONOTONIC) +# error No monotonic clock found. +# endif + +/* Implementations */ + +# if (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_TIME)) +# include +# endif + +# if (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == 
PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) +# include +# endif + +# if (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == \ + PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) +# include +# endif + +# if (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) +# include +# include +# endif + +# if (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == \ + PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) +# include +# include +# include +# endif + +/*** Implementations ***/ + +# define PSNIP_CLOCK_NSEC_PER_SEC ((psnip_uint32_t)(1000000000ULL)) + +# if (defined(PSNIP_CLOCK_CPU_METHOD) && \ + (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && \ + (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + 
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) +PSNIP_CLOCK__FUNCTION psnip_uint32_t +psnip_clock__clock_getres(clockid_t clk_id) { + struct timespec res; + int r; + + r = clock_getres(clk_id, &res); + if (r != 0) + return 0; + + return (psnip_uint32_t)(PSNIP_CLOCK_NSEC_PER_SEC / + (psnip_uint64_t)res.tv_nsec); +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock__clock_gettime(clockid_t clk_id, struct PsnipClockTimespec *res) { + struct timespec ts; + + if (clock_gettime(clk_id, &ts) != 0) + return -10; + + res->seconds = (psnip_uint64_t)(ts.tv_sec); + res->nanoseconds = (psnip_uint64_t)(ts.tv_nsec); + + return 0; +} +# endif + +PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_wall_get_precision(void) { +# if !defined(PSNIP_CLOCK_WALL_METHOD) + return 0; +# elif defined(PSNIP_CLOCK_WALL_METHOD) && \ + PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_WALL); +# elif defined(PSNIP_CLOCK_WALL_METHOD) && \ + PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY + return 1000000; +# elif defined(PSNIP_CLOCK_WALL_METHOD) && \ + PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME + return 1; +# else + return 0; +# endif +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock_wall_get_time(struct PsnipClockTimespec *res) { +# if !defined(PSNIP_CLOCK_WALL_METHOD) + (void)res; + + return -2; +# elif defined(PSNIP_CLOCK_WALL_METHOD) && \ + PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_WALL, res); +# elif defined(PSNIP_CLOCK_WALL_METHOD) && \ + PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME + res->seconds = time(NULL); + res->nanoseconds = 0; +# elif defined(PSNIP_CLOCK_WALL_METHOD) && \ + PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY + struct timeval tv; + + if (gettimeofday(&tv, NULL) != 0) + return -6; + + res->seconds = (psnip_uint64_t)tv.tv_sec; + 
res->nanoseconds = (psnip_uint64_t)tv.tv_usec * 1000; +# else + (void)res; + + return -2; +# endif + + return 0; +} + +PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_cpu_get_precision(void) { +# if !defined(PSNIP_CLOCK_CPU_METHOD) + return 0; +# elif defined(PSNIP_CLOCK_CPU_METHOD) && \ + PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_CPU); +# elif defined(PSNIP_CLOCK_CPU_METHOD) && \ + PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK + return CLOCKS_PER_SEC; +# elif defined(PSNIP_CLOCK_CPU_METHOD) && \ + PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES + return PSNIP_CLOCK_NSEC_PER_SEC / 100; +# else + return 0; +# endif +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock_cpu_get_time(struct PsnipClockTimespec *res) { +# if !defined(PSNIP_CLOCK_CPU_METHOD) + (void)res; + return -2; +# elif defined(PSNIP_CLOCK_CPU_METHOD) && \ + PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_CPU, res); +# elif defined(PSNIP_CLOCK_CPU_METHOD) && \ + PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK + clock_t t = clock(); + if (t == ((clock_t)-1)) + return -5; + res->seconds = t / CLOCKS_PER_SEC; + res->nanoseconds = + (t % CLOCKS_PER_SEC) * (PSNIP_CLOCK_NSEC_PER_SEC / CLOCKS_PER_SEC); +# elif defined(PSNIP_CLOCK_CPU_METHOD) && \ + PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES + FILETIME CreationTime, ExitTime, KernelTime, UserTime; + LARGE_INTEGER date, adjust; + + if (!GetProcessTimes(GetCurrentProcess(), &CreationTime, &ExitTime, + &KernelTime, &UserTime)) + return -7; + + /* http://www.frenk.com/2009/12/convert-filetime-to-unix-timestamp/ */ + date.HighPart = (LONG)UserTime.dwHighDateTime; + date.LowPart = UserTime.dwLowDateTime; + adjust.QuadPart = 11644473600000 * 10000; + date.QuadPart -= adjust.QuadPart; + + res->seconds = (psnip_uint64_t)(date.QuadPart / 10000000); + res->nanoseconds = 
(psnip_uint64_t)(date.QuadPart % 10000000) * + (PSNIP_CLOCK_NSEC_PER_SEC / 100); +# elif PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage) != 0) + return -8; + + res->seconds = usage.ru_utime.tv_sec; + res->nanoseconds = tv.tv_usec * 1000; +# else + (void)res; + return -2; +# endif + + return 0; +} + +PSNIP_CLOCK__FUNCTION psnip_uint32_t psnip_clock_monotonic_get_precision(void) { +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) + return 0; +# elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC); +# elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME + static mach_timebase_info_data_t tbi = { + 0, + }; + if (tbi.denom == 0) + mach_timebase_info(&tbi); + return (psnip_uint32_t)(tbi.numer / tbi.denom); +# elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64 + return 1000; +# elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + PSNIP_CLOCK_MONOTONIC_METHOD == \ + PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER + LARGE_INTEGER Frequency; + QueryPerformanceFrequency(&Frequency); + return (psnip_uint32_t)((Frequency.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC) + ? 
PSNIP_CLOCK_NSEC_PER_SEC + : Frequency.QuadPart); +# else + return 0; +# endif +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock_monotonic_get_time(struct PsnipClockTimespec *res) { +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) + (void)res; + return -2; +# elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC, res); +# elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME + psnip_uint64_t nsec = mach_absolute_time(); + static mach_timebase_info_data_t tbi = { + 0, + }; + if (tbi.denom == 0) + mach_timebase_info(&tbi); + nsec *= ((psnip_uint64_t)tbi.numer) / ((psnip_uint64_t)tbi.denom); + res->seconds = nsec / PSNIP_CLOCK_NSEC_PER_SEC; + res->nanoseconds = nsec % PSNIP_CLOCK_NSEC_PER_SEC; +# elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + PSNIP_CLOCK_MONOTONIC_METHOD == \ + PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER + LARGE_INTEGER t, f; + if (QueryPerformanceCounter(&t) == 0) + return -12; + + QueryPerformanceFrequency(&f); + res->seconds = (psnip_uint64_t)(t.QuadPart / f.QuadPart); + res->nanoseconds = (psnip_uint64_t)(t.QuadPart % f.QuadPart); + if (f.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC) + res->nanoseconds /= (psnip_uint64_t)f.QuadPart / PSNIP_CLOCK_NSEC_PER_SEC; + else + res->nanoseconds *= PSNIP_CLOCK_NSEC_PER_SEC / (psnip_uint64_t)f.QuadPart; +# elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && \ + PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64 + const ULONGLONG msec = GetTickCount64(); + res->seconds = msec / 1000; + res->nanoseconds = sec % 1000; +# else + return -2; +# endif + + return 0; +} + +/* Returns the number of ticks per second for the specified clock. + * For example, a clock with millisecond precision would return 1000, + * and a clock with 1 second (such as the time() function) would + * return 1. 
+ * + * If the requested clock isn't available, it will return 0. + * Hopefully this will be rare, but if it happens to you please let us + * know so we can work on finding a way to support your system. + * + * Note that different clocks on the same system often have a + * different precisions. + */ +PSNIP_CLOCK__FUNCTION psnip_uint32_t +psnip_clock_get_precision(enum PsnipClockType clock_type) { + switch (clock_type) { + case PSNIP_CLOCK_TYPE_MONOTONIC: + return psnip_clock_monotonic_get_precision(); + case PSNIP_CLOCK_TYPE_CPU: + return psnip_clock_cpu_get_precision(); + case PSNIP_CLOCK_TYPE_WALL: + return psnip_clock_wall_get_precision(); + } + + PSNIP_CLOCK_UNREACHABLE(); + return 0; +} + +/* Set the provided timespec to the requested time. Returns 0 on + * success, or a negative value on failure. */ +PSNIP_CLOCK__FUNCTION int psnip_clock_get_time(enum PsnipClockType clock_type, + struct PsnipClockTimespec *res) { + assert(res != NULL); + + switch (clock_type) { + case PSNIP_CLOCK_TYPE_MONOTONIC: + return psnip_clock_monotonic_get_time(res); + case PSNIP_CLOCK_TYPE_CPU: + return psnip_clock_cpu_get_time(res); + case PSNIP_CLOCK_TYPE_WALL: + return psnip_clock_wall_get_time(res); + } + + return -1; +} + +# endif /* !defined(PSNIP_CLOCK_H) */ + +static psnip_uint64_t munit_clock_get_elapsed(struct PsnipClockTimespec *start, + struct PsnipClockTimespec *end) { + psnip_uint64_t r = (end->seconds - start->seconds) * PSNIP_CLOCK_NSEC_PER_SEC; + if (end->nanoseconds < start->nanoseconds) { + return r - (start->nanoseconds - end->nanoseconds); + } + + return r + (end->nanoseconds - start->nanoseconds); +} + +#else +# include +#endif /* defined(MUNIT_ENABLE_TIMING) */ + +/*** PRNG stuff ***/ + +/* This is (unless I screwed up, which is entirely possible) the + * version of PCG with 32-bit state. It was chosen because it has a + * small enough state that we should reliably be able to use CAS + * instead of requiring a lock for thread-safety. 
+ * + * If I did screw up, I probably will not bother changing it unless + * there is a significant bias. It's really not important this be + * particularly strong, as long as it is fairly random it's much more + * important that it be reproducible, so bug reports have a better + * chance of being reproducible. */ + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) && !defined(__EMSCRIPTEN__) && \ + (!defined(__GNUC_MINOR__) || (__GNUC__ > 4) || \ + (__GNUC__ == 4 && __GNUC_MINOR__ > 8)) +# define HAVE_STDATOMIC +#elif defined(__clang__) +# if __has_extension(c_atomic) +# define HAVE_CLANG_ATOMICS +# endif +#endif + +/* Workaround for http://llvm.org/bugs/show_bug.cgi?id=26911 */ +#if defined(__clang__) && defined(_WIN32) +# undef HAVE_STDATOMIC +# if defined(__c2__) +# undef HAVE_CLANG_ATOMICS +# endif +#endif + +#if defined(_OPENMP) +# define ATOMIC_UINT32_T uint32_t +#elif defined(HAVE_STDATOMIC) +# include +# define ATOMIC_UINT32_T _Atomic uint32_t +#elif defined(HAVE_CLANG_ATOMICS) +# define ATOMIC_UINT32_T _Atomic uint32_t +#elif defined(_WIN32) +# define ATOMIC_UINT32_T volatile LONG +#else +# define ATOMIC_UINT32_T volatile uint32_t +#endif + +static ATOMIC_UINT32_T munit_rand_state = 42; + +#if defined(_OPENMP) +static inline void munit_atomic_store(ATOMIC_UINT32_T *dest, + ATOMIC_UINT32_T value) { +# pragma omp critical(munit_atomics) + *dest = value; +} + +static inline uint32_t munit_atomic_load(ATOMIC_UINT32_T *src) { + int ret; +# pragma omp critical(munit_atomics) + ret = *src; + return ret; +} + +static inline uint32_t munit_atomic_cas(ATOMIC_UINT32_T *dest, + ATOMIC_UINT32_T *expected, + ATOMIC_UINT32_T desired) { + munit_bool ret; + +# pragma omp critical(munit_atomics) + { + if (*dest == *expected) { + *dest = desired; + ret = 1; + } else { + ret = 0; + } + } + + return ret; +} +#elif defined(HAVE_STDATOMIC) +# define munit_atomic_store(dest, value) atomic_store(dest, value) +# define 
munit_atomic_load(src) atomic_load(src) +# define munit_atomic_cas(dest, expected, value) \ + atomic_compare_exchange_weak(dest, expected, value) +#elif defined(HAVE_CLANG_ATOMICS) +# define munit_atomic_store(dest, value) \ + __c11_atomic_store(dest, value, __ATOMIC_SEQ_CST) +# define munit_atomic_load(src) __c11_atomic_load(src, __ATOMIC_SEQ_CST) +# define munit_atomic_cas(dest, expected, value) \ + __c11_atomic_compare_exchange_weak(dest, expected, value, \ + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) +#elif defined(__GNUC__) && (__GNUC__ > 4) || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) +# define munit_atomic_store(dest, value) \ + __atomic_store_n(dest, value, __ATOMIC_SEQ_CST) +# define munit_atomic_load(src) __atomic_load_n(src, __ATOMIC_SEQ_CST) +# define munit_atomic_cas(dest, expected, value) \ + __atomic_compare_exchange_n(dest, expected, value, 1, __ATOMIC_SEQ_CST, \ + __ATOMIC_SEQ_CST) +#elif defined(__GNUC__) && (__GNUC__ >= 4) +# define munit_atomic_store(dest, value) \ + do { \ + *(dest) = (value); \ + } while (0) +# define munit_atomic_load(src) (*(src)) +# define munit_atomic_cas(dest, expected, value) \ + __sync_bool_compare_and_swap(dest, *expected, value) +#elif defined(_WIN32) /* Untested */ +# define munit_atomic_store(dest, value) \ + do { \ + *(dest) = (value); \ + } while (0) +# define munit_atomic_load(src) (*(src)) +# define munit_atomic_cas(dest, expected, value) \ + InterlockedCompareExchange((dest), (value), *(expected)) +#else +# warning No atomic implementation, PRNG will not be thread-safe +# define munit_atomic_store(dest, value) \ + do { \ + *(dest) = (value); \ + } while (0) +# define munit_atomic_load(src) (*(src)) +static inline munit_bool munit_atomic_cas(ATOMIC_UINT32_T *dest, + ATOMIC_UINT32_T *expected, + ATOMIC_UINT32_T desired) { + if (*dest == *expected) { + *dest = desired; + return 1; + } else { + return 0; + } +} +#endif + +#define MUNIT_PRNG_MULTIPLIER (747796405U) +#define MUNIT_PRNG_INCREMENT (1729U) + +static 
munit_uint32_t munit_rand_next_state(munit_uint32_t state) { + return state * MUNIT_PRNG_MULTIPLIER + MUNIT_PRNG_INCREMENT; +} + +static munit_uint32_t munit_rand_from_state(munit_uint32_t state) { + munit_uint32_t res = ((state >> ((state >> 28) + 4)) ^ state) * (277803737U); + res ^= res >> 22; + return res; +} + +void munit_rand_seed(munit_uint32_t seed) { + munit_uint32_t state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); + munit_atomic_store(&munit_rand_state, state); +} + +static munit_uint32_t munit_rand_generate_seed(void) { + munit_uint32_t seed, state; +#if defined(MUNIT_ENABLE_TIMING) + struct PsnipClockTimespec wc = { + 0, + }; + + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wc); + seed = (munit_uint32_t)wc.nanoseconds; +#else + seed = (munit_uint32_t)time(NULL); +#endif + + state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); + return munit_rand_from_state(state); +} + +static munit_uint32_t munit_rand_state_uint32(munit_uint32_t *state) { + const munit_uint32_t old = *state; + *state = munit_rand_next_state(old); + return munit_rand_from_state(old); +} + +munit_uint32_t munit_rand_uint32(void) { + munit_uint32_t old, state; + + do { + old = munit_atomic_load(&munit_rand_state); + state = munit_rand_next_state(old); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return munit_rand_from_state(old); +} + +static void munit_rand_state_memory(munit_uint32_t *state, size_t size, + munit_uint8_t *data) { + size_t members_remaining = size / sizeof(munit_uint32_t); + size_t bytes_remaining = size % sizeof(munit_uint32_t); + munit_uint8_t *b = data; + munit_uint32_t rv; + while (members_remaining-- > 0) { + rv = munit_rand_state_uint32(state); + memcpy(b, &rv, sizeof(munit_uint32_t)); + b += sizeof(munit_uint32_t); + } + if (bytes_remaining != 0) { + rv = munit_rand_state_uint32(state); + memcpy(b, &rv, bytes_remaining); + } +} + +void munit_rand_memory(size_t size, munit_uint8_t *data) { + munit_uint32_t old, state; + + do { 
+ state = old = munit_atomic_load(&munit_rand_state); + munit_rand_state_memory(&state, size, data); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); +} + +static munit_uint32_t munit_rand_state_at_most(munit_uint32_t *state, + munit_uint32_t salt, + munit_uint32_t max) { + /* We want (UINT32_MAX + 1) % max, which in unsigned arithmetic is the same + * as (UINT32_MAX + 1 - max) % max = -max % max. We compute -max using not + * to avoid compiler warnings. + */ + const munit_uint32_t min = (~max + 1U) % max; + munit_uint32_t x; + + if (max == (~((munit_uint32_t)0U))) + return munit_rand_state_uint32(state) ^ salt; + + max++; + + do { + x = munit_rand_state_uint32(state) ^ salt; + } while (x < min); + + return x % max; +} + +static munit_uint32_t munit_rand_at_most(munit_uint32_t salt, + munit_uint32_t max) { + munit_uint32_t old, state; + munit_uint32_t retval; + + do { + state = old = munit_atomic_load(&munit_rand_state); + retval = munit_rand_state_at_most(&state, salt, max); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return retval; +} + +int munit_rand_int_range(int min, int max) { + munit_uint64_t range = (munit_uint64_t)max - (munit_uint64_t)min; + + if (min > max) + return munit_rand_int_range(max, min); + + if (range > (~((munit_uint32_t)0U))) + range = (~((munit_uint32_t)0U)); + + return min + (int)munit_rand_at_most(0, (munit_uint32_t)range); +} + +double munit_rand_double(void) { + munit_uint32_t old, state; + double retval = 0.0; + + do { + state = old = munit_atomic_load(&munit_rand_state); + + /* See http://mumble.net/~campbell/tmp/random_real.c for how to do + * this right. Patches welcome if you feel that this is too + * biased. 
*/ + retval = munit_rand_state_uint32(&state) / ((~((munit_uint32_t)0U)) + 1.0); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return retval; +} + +/*** Test suite handling ***/ + +typedef struct { + unsigned int successful; + unsigned int skipped; + unsigned int failed; + unsigned int errored; +#if defined(MUNIT_ENABLE_TIMING) + munit_uint64_t cpu_clock; + munit_uint64_t wall_clock; +#endif +} MunitReport; + +typedef struct { + const char *prefix; + const MunitSuite *suite; + const char **tests; + munit_uint32_t seed; + unsigned int iterations; + MunitParameter *parameters; + munit_bool single_parameter_mode; + void *user_data; + MunitReport report; + munit_bool colorize; + munit_bool fork; + munit_bool show_stderr; + munit_bool fatal_failures; +} MunitTestRunner; + +const char *munit_parameters_get(const MunitParameter params[], + const char *key) { + const MunitParameter *param; + + for (param = params; param != NULL && param->name != NULL; param++) + if (strcmp(param->name, key) == 0) + return param->value; + return NULL; +} + +#if defined(MUNIT_ENABLE_TIMING) +static void munit_print_time(FILE *fp, munit_uint64_t nanoseconds) { + fprintf(fp, "%" MUNIT_TEST_TIME_FORMAT, + ((double)nanoseconds) / ((double)PSNIP_CLOCK_NSEC_PER_SEC)); +} +#endif + +/* Add a paramter to an array of parameters. */ +static MunitResult munit_parameters_add(size_t *params_size, + MunitParameter **params, char *name, + char *value) { + *params = realloc(*params, sizeof(MunitParameter) * (*params_size + 2)); + if (*params == NULL) + return MUNIT_ERROR; + + (*params)[*params_size].name = name; + (*params)[*params_size].value = value; + (*params_size)++; + (*params)[*params_size].name = NULL; + (*params)[*params_size].value = NULL; + + return MUNIT_OK; +} + +/* Concatenate two strings, but just return one of the components + * unaltered if the other is NULL or "". 
*/ +static char *munit_maybe_concat(size_t *len, char *prefix, char *suffix) { + char *res; + size_t res_l; + const size_t prefix_l = prefix != NULL ? strlen(prefix) : 0; + const size_t suffix_l = suffix != NULL ? strlen(suffix) : 0; + if (prefix_l == 0 && suffix_l == 0) { + res = NULL; + res_l = 0; + } else if (prefix_l == 0 && suffix_l != 0) { + res = suffix; + res_l = suffix_l; + } else if (prefix_l != 0 && suffix_l == 0) { + res = prefix; + res_l = prefix_l; + } else { + res_l = prefix_l + suffix_l; + res = malloc(res_l + 1); + memcpy(res, prefix, prefix_l); + memcpy(res + prefix_l, suffix, suffix_l); + res[res_l] = 0; + } + + if (len != NULL) + *len = res_l; + + return res; +} + +/* Possbily free a string returned by munit_maybe_concat. */ +static void munit_maybe_free_concat(char *s, const char *prefix, + const char *suffix) { + if (prefix != s && suffix != s) + free(s); +} + +/* Cheap string hash function, just used to salt the PRNG. */ +static munit_uint32_t munit_str_hash(const char *name) { + const char *p; + munit_uint32_t h = 5381U; + + for (p = name; *p != '\0'; p++) + h = (munit_uint32_t)(h << 5) + h + (munit_uint32_t)*p; + + return h; +} + +static void munit_splice(int from, int to) { + munit_uint8_t buf[1024]; +#if !defined(_WIN32) + ssize_t len; + ssize_t bytes_written; + ssize_t write_res; +#else + int len; + int bytes_written; + int write_res; +#endif + do { + len = read(from, buf, sizeof(buf)); + if (len > 0) { + bytes_written = 0; + do { + write_res = write(to, buf + bytes_written, +#if !defined(_WIN32) + (size_t) +#else + (unsigned int) +#endif + (len - bytes_written)); + if (write_res < 0) + break; + bytes_written += write_res; + } while (bytes_written < len); + } else + break; + } while (1); +} + +/* This is the part that should be handled in the child process */ +static MunitResult munit_test_runner_exec(MunitTestRunner *runner, + const MunitTest *test, + const MunitParameter params[], + MunitReport *report) { + unsigned int iterations = 
runner->iterations; + MunitResult result = MUNIT_FAIL; +#if defined(MUNIT_ENABLE_TIMING) + struct PsnipClockTimespec wall_clock_begin = + { + 0, + }, + wall_clock_end = { + 0, + }; + struct PsnipClockTimespec cpu_clock_begin = + { + 0, + }, + cpu_clock_end = { + 0, + }; +#endif + unsigned int i = 0; + + if ((test->options & MUNIT_TEST_OPTION_SINGLE_ITERATION) == + MUNIT_TEST_OPTION_SINGLE_ITERATION) + iterations = 1; + else if (iterations == 0) + iterations = runner->suite->iterations; + + munit_rand_seed(runner->seed); + + do { + void *data = (test->setup == NULL) ? runner->user_data + : test->setup(params, runner->user_data); + +#if defined(MUNIT_ENABLE_TIMING) + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_begin); + psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_begin); +#endif + + result = test->test(params, data); + +#if defined(MUNIT_ENABLE_TIMING) + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_end); + psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_end); +#endif + + if (test->tear_down != NULL) + test->tear_down(data); + + if (MUNIT_LIKELY(result == MUNIT_OK)) { + report->successful++; +#if defined(MUNIT_ENABLE_TIMING) + report->wall_clock += + munit_clock_get_elapsed(&wall_clock_begin, &wall_clock_end); + report->cpu_clock += + munit_clock_get_elapsed(&cpu_clock_begin, &cpu_clock_end); +#endif + } else { + switch ((int)result) { + case MUNIT_SKIP: + report->skipped++; + break; + case MUNIT_FAIL: + report->failed++; + break; + case MUNIT_ERROR: + report->errored++; + break; + default: + break; + } + break; + } + } while (++i < iterations); + + return result; +} + +#if defined(MUNIT_EMOTICON) +# define MUNIT_RESULT_STRING_OK ":)" +# define MUNIT_RESULT_STRING_SKIP ":|" +# define MUNIT_RESULT_STRING_FAIL ":(" +# define MUNIT_RESULT_STRING_ERROR ":o" +# define MUNIT_RESULT_STRING_TODO ":/" +#else +# define MUNIT_RESULT_STRING_OK "OK " +# define MUNIT_RESULT_STRING_SKIP "SKIP " +# define MUNIT_RESULT_STRING_FAIL "FAIL " +# 
define MUNIT_RESULT_STRING_ERROR "ERROR" +# define MUNIT_RESULT_STRING_TODO "TODO " +#endif + +static void munit_test_runner_print_color(const MunitTestRunner *runner, + const char *string, char color) { + if (runner->colorize) + fprintf(MUNIT_OUTPUT_FILE, "\x1b[3%cm%s\x1b[39m", color, string); + else + fputs(string, MUNIT_OUTPUT_FILE); +} + +#if !defined(MUNIT_NO_BUFFER) +static int munit_replace_stderr(FILE *stderr_buf) { + if (stderr_buf != NULL) { + const int orig_stderr = dup(STDERR_FILENO); + + int errfd = fileno(stderr_buf); + if (MUNIT_UNLIKELY(errfd == -1)) { + exit(EXIT_FAILURE); + } + + dup2(errfd, STDERR_FILENO); + + return orig_stderr; + } + + return -1; +} + +static void munit_restore_stderr(int orig_stderr) { + if (orig_stderr != -1) { + dup2(orig_stderr, STDERR_FILENO); + close(orig_stderr); + } +} +#endif /* !defined(MUNIT_NO_BUFFER) */ + +/* Run a test with the specified parameters. */ +static void +munit_test_runner_run_test_with_params(MunitTestRunner *runner, + const MunitTest *test, + const MunitParameter params[]) { + MunitResult result = MUNIT_OK; + MunitReport report = {0, 0, 0, 0, +#if defined(MUNIT_ENABLE_TIMING) + 0, 0 +#endif + }; + unsigned int output_l; + munit_bool first; + const MunitParameter *param; + FILE *stderr_buf; +#if !defined(MUNIT_NO_FORK) + int pipefd[2]; + pid_t fork_pid; + ssize_t bytes_written = 0; + ssize_t write_res; + ssize_t bytes_read = 0; + ssize_t read_res; + int status = 0; + pid_t changed_pid; +#endif + + if (params != NULL) { + output_l = 2; + fputs(" ", MUNIT_OUTPUT_FILE); + first = 1; + for (param = params; param != NULL && param->name != NULL; param++) { + if (!first) { + fputs(", ", MUNIT_OUTPUT_FILE); + output_l += 2; + } else { + first = 0; + } + + output_l += (unsigned int)fprintf(MUNIT_OUTPUT_FILE, "%s=%s", param->name, + param->value); + } + while (output_l++ < MUNIT_TEST_NAME_LEN) { + fputc(' ', MUNIT_OUTPUT_FILE); + } + } + + fflush(MUNIT_OUTPUT_FILE); + + stderr_buf = NULL; +#if !defined(_WIN32) 
|| defined(__MINGW32__) + stderr_buf = tmpfile(); +#else + tmpfile_s(&stderr_buf); +#endif + if (stderr_buf == NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, + "unable to create buffer for stderr"); + result = MUNIT_ERROR; + goto print_result; + } + +#if !defined(MUNIT_NO_FORK) + if (runner->fork) { + pipefd[0] = -1; + pipefd[1] = -1; + if (pipe(pipefd) != 0) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create pipe"); + result = MUNIT_ERROR; + goto print_result; + } + + fork_pid = fork(); + if (fork_pid == 0) { + int orig_stderr; + + close(pipefd[0]); + + orig_stderr = munit_replace_stderr(stderr_buf); + munit_test_runner_exec(runner, test, params, &report); + + /* Note that we don't restore stderr. This is so we can buffer + * things written to stderr later on (such as by + * asan/tsan/ubsan, valgrind, etc.) */ + close(orig_stderr); + + do { + write_res = + write(pipefd[1], ((munit_uint8_t *)(&report)) + bytes_written, + sizeof(report) - (size_t)bytes_written); + if (write_res < 0) { + if (stderr_buf != NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to write to pipe"); + } + exit(EXIT_FAILURE); + } + bytes_written += write_res; + } while ((size_t)bytes_written < sizeof(report)); + + if (stderr_buf != NULL) + fclose(stderr_buf); + close(pipefd[1]); + + exit(EXIT_SUCCESS); + } else if (fork_pid == -1) { + close(pipefd[0]); + close(pipefd[1]); + if (stderr_buf != NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to fork"); + } + report.errored++; + result = MUNIT_ERROR; + } else { + close(pipefd[1]); + do { + read_res = read(pipefd[0], ((munit_uint8_t *)(&report)) + bytes_read, + sizeof(report) - (size_t)bytes_read); + if (read_res < 1) + break; + bytes_read += read_res; + } while (bytes_read < (ssize_t)sizeof(report)); + + changed_pid = waitpid(fork_pid, &status, 0); + + if (MUNIT_LIKELY(changed_pid == fork_pid) && + MUNIT_LIKELY(WIFEXITED(status))) { + if (bytes_read != sizeof(report)) { + munit_logf_internal(MUNIT_LOG_ERROR, 
stderr_buf, + "child exited unexpectedly with status %d", + WEXITSTATUS(status)); + report.errored++; + } else if (WEXITSTATUS(status) != EXIT_SUCCESS) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, + "child exited with status %d", + WEXITSTATUS(status)); + report.errored++; + } + } else { + if (WIFSIGNALED(status)) { +# if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 700) + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, + "child killed by signal %d (%s)", + WTERMSIG(status), strsignal(WTERMSIG(status))); +# else + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, + "child killed by signal %d", WTERMSIG(status)); +# endif + } else if (WIFSTOPPED(status)) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, + "child stopped by signal %d", WSTOPSIG(status)); + } + report.errored++; + } + + close(pipefd[0]); + waitpid(fork_pid, NULL, 0); + } + } else +#endif + { +#if !defined(MUNIT_NO_BUFFER) + const volatile int orig_stderr = munit_replace_stderr(stderr_buf); +#endif + +#if defined(MUNIT_THREAD_LOCAL) + if (MUNIT_UNLIKELY(setjmp(munit_error_jmp_buf) != 0)) { + result = MUNIT_FAIL; + report.failed++; + } else { + munit_error_jmp_buf_valid = 1; + result = munit_test_runner_exec(runner, test, params, &report); + } +#else + result = munit_test_runner_exec(runner, test, params, &report); +#endif + +#if !defined(MUNIT_NO_BUFFER) + munit_restore_stderr(orig_stderr); +#endif + + /* Here just so that the label is used on Windows and we don't get + * a warning */ + goto print_result; + } + +print_result: + + fputs("[ ", MUNIT_OUTPUT_FILE); + if ((test->options & MUNIT_TEST_OPTION_TODO) == MUNIT_TEST_OPTION_TODO) { + if (report.failed != 0 || report.errored != 0 || report.skipped != 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_TODO, '3'); + result = MUNIT_OK; + } else { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); + if (MUNIT_LIKELY(stderr_buf != NULL)) + munit_log_internal(MUNIT_LOG_ERROR, stderr_buf, + "Test marked TODO, 
but was successful."); + runner->report.failed++; + result = MUNIT_ERROR; + } + } else if (report.failed > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_FAIL, '1'); + runner->report.failed++; + result = MUNIT_FAIL; + } else if (report.errored > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); + runner->report.errored++; + result = MUNIT_ERROR; + } else if (report.skipped > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_SKIP, '3'); + runner->report.skipped++; + result = MUNIT_SKIP; + } else if (report.successful > 1) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); +#if defined(MUNIT_ENABLE_TIMING) + fputs(" ] [ ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock / report.successful); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock / report.successful); + fprintf(MUNIT_OUTPUT_FILE, + " CPU ]\n %-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s Total: [ ", + ""); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); + fputs(" CPU", MUNIT_OUTPUT_FILE); +#endif + runner->report.successful++; + result = MUNIT_OK; + } else if (report.successful > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); +#if defined(MUNIT_ENABLE_TIMING) + fputs(" ] [ ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); + fputs(" CPU", MUNIT_OUTPUT_FILE); +#endif + runner->report.successful++; + result = MUNIT_OK; + } + fputs(" ]\n", MUNIT_OUTPUT_FILE); + + if (stderr_buf != NULL) { + if (result == MUNIT_FAIL || result == MUNIT_ERROR || runner->show_stderr) { + fflush(MUNIT_OUTPUT_FILE); + + rewind(stderr_buf); + munit_splice(fileno(stderr_buf), STDERR_FILENO); + + fflush(stderr); + } + + fclose(stderr_buf); + 
} +} + +static void munit_test_runner_run_test_wild(MunitTestRunner *runner, + const MunitTest *test, + const char *test_name, + MunitParameter *params, + MunitParameter *p) { + const MunitParameterEnum *pe; + char **values; + MunitParameter *next; + + for (pe = test->parameters; pe != NULL && pe->name != NULL; pe++) { + if (p->name == pe->name) + break; + } + + if (pe == NULL) + return; + + for (values = pe->values; *values != NULL; values++) { + next = p + 1; + p->value = *values; + if (next->name == NULL) { + munit_test_runner_run_test_with_params(runner, test, params); + } else { + munit_test_runner_run_test_wild(runner, test, test_name, params, next); + } + if (runner->fatal_failures && + (runner->report.failed != 0 || runner->report.errored != 0)) + break; + } +} + +/* Run a single test, with every combination of parameters + * requested. */ +static void munit_test_runner_run_test(MunitTestRunner *runner, + const MunitTest *test, + const char *prefix) { + char *test_name = + munit_maybe_concat(NULL, (char *)prefix, (char *)test->name); + /* The array of parameters to pass to + * munit_test_runner_run_test_with_params */ + MunitParameter *params = NULL; + size_t params_l = 0; + /* Wildcard parameters are parameters which have possible values + * specified in the test, but no specific value was passed to the + * CLI. That means we want to run the test once for every + * possible combination of parameter values or, if --single was + * passed to the CLI, a single time with a random set of + * parameters. */ + MunitParameter *wild_params = NULL; + size_t wild_params_l = 0; + const MunitParameterEnum *pe; + const MunitParameter *cli_p; + munit_bool filled; + unsigned int possible; + char **vals; + size_t first_wild; + const MunitParameter *wp; + int pidx; + + munit_rand_seed(runner->seed); + + fprintf(MUNIT_OUTPUT_FILE, "%-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s", + test_name); + + if (test->parameters == NULL) { + /* No parameters. Simple, nice. 
*/ + munit_test_runner_run_test_with_params(runner, test, NULL); + } else { + fputc('\n', MUNIT_OUTPUT_FILE); + + for (pe = test->parameters; pe != NULL && pe->name != NULL; pe++) { + /* Did we receive a value for this parameter from the CLI? */ + filled = 0; + for (cli_p = runner->parameters; cli_p != NULL && cli_p->name != NULL; + cli_p++) { + if (strcmp(cli_p->name, pe->name) == 0) { + if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, + cli_p->value) != MUNIT_OK)) + goto cleanup; + filled = 1; + break; + } + } + if (filled) + continue; + + /* Nothing from CLI, is the enum NULL/empty? We're not a + * fuzzer… */ + if (pe->values == NULL || pe->values[0] == NULL) + continue; + + /* If --single was passed to the CLI, choose a value from the + * list of possibilities randomly. */ + if (runner->single_parameter_mode) { + possible = 0; + for (vals = pe->values; *vals != NULL; vals++) + possible++; + /* We want the tests to be reproducible, even if you're only + * running a single test, but we don't want every test with + * the same number of parameters to choose the same parameter + * number, so use the test name as a primitive salt. */ + pidx = (int)munit_rand_at_most(munit_str_hash(test_name), possible - 1); + if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, + pe->values[pidx]) != MUNIT_OK)) + goto cleanup; + } else { + /* We want to try every permutation. Put in a placeholder + * entry, we'll iterate through them later. 
*/ + if (MUNIT_UNLIKELY(munit_parameters_add(&wild_params_l, &wild_params, + pe->name, NULL) != MUNIT_OK)) + goto cleanup; + } + } + + if (wild_params_l != 0) { + first_wild = params_l; + for (wp = wild_params; wp != NULL && wp->name != NULL; wp++) { + for (pe = test->parameters; + pe != NULL && pe->name != NULL && pe->values != NULL; pe++) { + if (strcmp(wp->name, pe->name) == 0) { + if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, + pe->name, + pe->values[0]) != MUNIT_OK)) + goto cleanup; + } + } + } + + munit_test_runner_run_test_wild(runner, test, test_name, params, + params + first_wild); + } else { + munit_test_runner_run_test_with_params(runner, test, params); + } + + cleanup: + free(params); + free(wild_params); + } + + munit_maybe_free_concat(test_name, prefix, test->name); +} + +/* Recurse through the suite and run all the tests. If a list of + * tests to run was provided on the command line, run only those + * tests. */ +static void munit_test_runner_run_suite(MunitTestRunner *runner, + const MunitSuite *suite, + const char *prefix) { + size_t pre_l; + char *pre = munit_maybe_concat(&pre_l, (char *)prefix, (char *)suite->prefix); + const MunitTest *test; + const char **test_name; + const MunitSuite *child_suite; + + /* Run the tests. 
*/ + for (test = suite->tests; test != NULL && test->test != NULL; test++) { + if (runner->tests != NULL) { /* Specific tests were requested on the CLI */ + for (test_name = runner->tests; test_name != NULL && *test_name != NULL; + test_name++) { + if ((pre_l == 0 || strncmp(pre, *test_name, pre_l) == 0) && + strncmp(test->name, *test_name + pre_l, + strlen(*test_name + pre_l)) == 0) { + munit_test_runner_run_test(runner, test, pre); + if (runner->fatal_failures && + (runner->report.failed != 0 || runner->report.errored != 0)) + goto cleanup; + } + } + } else { /* Run all tests */ + munit_test_runner_run_test(runner, test, pre); + } + } + + if (runner->fatal_failures && + (runner->report.failed != 0 || runner->report.errored != 0)) + goto cleanup; + + /* Run any child suites. */ + for (child_suite = suite->suites; + child_suite != NULL && child_suite->prefix != NULL; child_suite++) { + munit_test_runner_run_suite(runner, child_suite, pre); + } + +cleanup: + + munit_maybe_free_concat(pre, prefix, suite->prefix); +} + +static void munit_test_runner_run(MunitTestRunner *runner) { + munit_test_runner_run_suite(runner, runner->suite, NULL); +} + +static void munit_print_help(int argc, char *const *argv, void *user_data, + const MunitArgument arguments[]) { + const MunitArgument *arg; + (void)argc; + + printf("USAGE: %s [OPTIONS...] [TEST...]\n\n", argv[0]); + puts( + " --seed SEED\n" + " Value used to seed the PRNG. Must be a 32-bit integer in " + "decimal\n" + " notation with no separators (commas, decimals, spaces, " + "etc.), or\n" + " hexidecimal prefixed by \"0x\".\n" + " --iterations N\n" + " Run each test N times. 0 means the default number.\n" + " --param name value\n" + " A parameter key/value pair which will be passed to any test " + "with\n" + " takes a parameter of that name. 
If not provided, the test " + "will be\n" + " run once for each possible parameter value.\n" + " --list Write a list of all available tests.\n" + " --list-params\n" + " Write a list of all available tests and their possible " + "parameters.\n" + " --single Run each parameterized test in a single configuration " + "instead of\n" + " every possible combination\n" + " --log-visible debug|info|warning|error\n" + " --log-fatal debug|info|warning|error\n" + " Set the level at which messages of different severities are " + "visible,\n" + " or cause the test to terminate.\n" +#if !defined(MUNIT_NO_FORK) + " --no-fork Do not execute tests in a child process. If this option is " + "supplied\n" + " and a test crashes (including by failing an assertion), no " + "further\n" + " tests will be performed.\n" +#endif + " --fatal-failures\n" + " Stop executing tests as soon as a failure is found.\n" + " --show-stderr\n" + " Show data written to stderr by the tests, even if the test " + "succeeds.\n" + " --color auto|always|never\n" + " Colorize (or don't) the output.\n" + /* 12345678901234567890123456789012345678901234567890123456789012345678901234567890 + */ + " --help Print this help message and exit.\n"); +#if defined(MUNIT_NL_LANGINFO) + setlocale(LC_ALL, ""); + fputs((strcasecmp("UTF-8", nl_langinfo(CODESET)) == 0) ? 
"µnit" : "munit", + stdout); +#else + puts("munit"); +#endif + printf(" %d.%d.%d\n" + "Full documentation at: https://nemequ.github.io/munit/\n", + (MUNIT_CURRENT_VERSION >> 16) & 0xff, + (MUNIT_CURRENT_VERSION >> 8) & 0xff, + (MUNIT_CURRENT_VERSION >> 0) & 0xff); + for (arg = arguments; arg != NULL && arg->name != NULL; arg++) + arg->write_help(arg, user_data); +} + +static const MunitArgument * +munit_arguments_find(const MunitArgument arguments[], const char *name) { + const MunitArgument *arg; + + for (arg = arguments; arg != NULL && arg->name != NULL; arg++) + if (strcmp(arg->name, name) == 0) + return arg; + + return NULL; +} + +static void munit_suite_list_tests(const MunitSuite *suite, + munit_bool show_params, const char *prefix) { + size_t pre_l; + char *pre = munit_maybe_concat(&pre_l, (char *)prefix, (char *)suite->prefix); + const MunitTest *test; + const MunitParameterEnum *params; + munit_bool first; + char **val; + const MunitSuite *child_suite; + + for (test = suite->tests; test != NULL && test->name != NULL; test++) { + if (pre != NULL) + fputs(pre, stdout); + puts(test->name); + + if (show_params) { + for (params = test->parameters; params != NULL && params->name != NULL; + params++) { + fprintf(stdout, " - %s: ", params->name); + if (params->values == NULL) { + puts("Any"); + } else { + first = 1; + for (val = params->values; *val != NULL; val++) { + if (!first) { + fputs(", ", stdout); + } else { + first = 0; + } + fputs(*val, stdout); + } + putc('\n', stdout); + } + } + } + } + + for (child_suite = suite->suites; + child_suite != NULL && child_suite->prefix != NULL; child_suite++) { + munit_suite_list_tests(child_suite, show_params, pre); + } + + munit_maybe_free_concat(pre, prefix, suite->prefix); +} + +static munit_bool munit_stream_supports_ansi(FILE *stream) { +#if !defined(_WIN32) + return isatty(fileno(stream)); +#else + +# if !defined(__MINGW32__) + size_t ansicon_size = 0; +# endif + + if (isatty(fileno(stream))) { +# if 
!defined(__MINGW32__) + getenv_s(&ansicon_size, NULL, 0, "ANSICON"); + return ansicon_size != 0; +# else + return getenv("ANSICON") != NULL; +# endif + } + return 0; +#endif +} + +int munit_suite_main_custom(const MunitSuite *suite, void *user_data, int argc, + char *const *argv, + const MunitArgument arguments[]) { + int result = EXIT_FAILURE; + MunitTestRunner runner; + size_t parameters_size = 0; + size_t tests_size = 0; + int arg; + + char *envptr; + unsigned long ts; + char *endptr; + unsigned long long iterations; + MunitLogLevel level; + const MunitArgument *argument; + const char **runner_tests; + unsigned int tests_run; + unsigned int tests_total; + + runner.prefix = NULL; + runner.suite = NULL; + runner.tests = NULL; + runner.seed = 0; + runner.iterations = 0; + runner.parameters = NULL; + runner.single_parameter_mode = 0; + runner.user_data = NULL; + + runner.report.successful = 0; + runner.report.skipped = 0; + runner.report.failed = 0; + runner.report.errored = 0; +#if defined(MUNIT_ENABLE_TIMING) + runner.report.cpu_clock = 0; + runner.report.wall_clock = 0; +#endif + + runner.colorize = 0; +#if !defined(_WIN32) + runner.fork = 1; +#else + runner.fork = 0; +#endif + runner.show_stderr = 0; + runner.fatal_failures = 0; + runner.suite = suite; + runner.user_data = user_data; + runner.seed = munit_rand_generate_seed(); + runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); + + for (arg = 1; arg < argc; arg++) { + if (strncmp("--", argv[arg], 2) == 0) { + if (strcmp("seed", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "%s requires an argument", argv[arg]); + goto cleanup; + } + + envptr = argv[arg + 1]; + ts = strtoul(argv[arg + 1], &envptr, 0); + if (*envptr != '\0' || ts > (~((munit_uint32_t)0U))) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "invalid value ('%s') passed to %s", + argv[arg + 1], argv[arg]); + goto cleanup; + } + runner.seed = (munit_uint32_t)ts; + + arg++; + } else if 
(strcmp("iterations", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "%s requires an argument", argv[arg]); + goto cleanup; + } + + endptr = argv[arg + 1]; + iterations = strtoul(argv[arg + 1], &endptr, 0); + if (*endptr != '\0' || iterations > UINT_MAX) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "invalid value ('%s') passed to %s", + argv[arg + 1], argv[arg]); + goto cleanup; + } + + runner.iterations = (unsigned int)iterations; + + arg++; + } else if (strcmp("param", argv[arg] + 2) == 0) { + if (arg + 2 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "%s requires two arguments", argv[arg]); + goto cleanup; + } + + runner.parameters = realloc(runner.parameters, sizeof(MunitParameter) * + (parameters_size + 2)); + if (runner.parameters == NULL) { + munit_log_internal(MUNIT_LOG_ERROR, stderr, + "failed to allocate memory"); + goto cleanup; + } + runner.parameters[parameters_size].name = (char *)argv[arg + 1]; + runner.parameters[parameters_size].value = (char *)argv[arg + 2]; + parameters_size++; + runner.parameters[parameters_size].name = NULL; + runner.parameters[parameters_size].value = NULL; + arg += 2; + } else if (strcmp("color", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "%s requires an argument", argv[arg]); + goto cleanup; + } + + if (strcmp(argv[arg + 1], "always") == 0) + runner.colorize = 1; + else if (strcmp(argv[arg + 1], "never") == 0) + runner.colorize = 0; + else if (strcmp(argv[arg + 1], "auto") == 0) + runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); + else { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "invalid value ('%s') passed to %s", + argv[arg + 1], argv[arg]); + goto cleanup; + } + + arg++; + } else if (strcmp("help", argv[arg] + 2) == 0) { + munit_print_help(argc, argv, user_data, arguments); + result = EXIT_SUCCESS; + goto cleanup; + } else if (strcmp("single", argv[arg] + 2) == 0) { + 
runner.single_parameter_mode = 1; + } else if (strcmp("show-stderr", argv[arg] + 2) == 0) { + runner.show_stderr = 1; +#if !defined(_WIN32) + } else if (strcmp("no-fork", argv[arg] + 2) == 0) { + runner.fork = 0; +#endif + } else if (strcmp("fatal-failures", argv[arg] + 2) == 0) { + runner.fatal_failures = 1; + } else if (strcmp("log-visible", argv[arg] + 2) == 0 || + strcmp("log-fatal", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "%s requires an argument", argv[arg]); + goto cleanup; + } + + if (strcmp(argv[arg + 1], "debug") == 0) + level = MUNIT_LOG_DEBUG; + else if (strcmp(argv[arg + 1], "info") == 0) + level = MUNIT_LOG_INFO; + else if (strcmp(argv[arg + 1], "warning") == 0) + level = MUNIT_LOG_WARNING; + else if (strcmp(argv[arg + 1], "error") == 0) + level = MUNIT_LOG_ERROR; + else { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "invalid value ('%s') passed to %s", + argv[arg + 1], argv[arg]); + goto cleanup; + } + + if (strcmp("log-visible", argv[arg] + 2) == 0) + munit_log_level_visible = level; + else + munit_log_level_fatal = level; + + arg++; + } else if (strcmp("list", argv[arg] + 2) == 0) { + munit_suite_list_tests(suite, 0, NULL); + result = EXIT_SUCCESS; + goto cleanup; + } else if (strcmp("list-params", argv[arg] + 2) == 0) { + munit_suite_list_tests(suite, 1, NULL); + result = EXIT_SUCCESS; + goto cleanup; + } else { + argument = munit_arguments_find(arguments, argv[arg] + 2); + if (argument == NULL) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, + "unknown argument ('%s')", argv[arg]); + goto cleanup; + } + + if (!argument->parse_argument(suite, user_data, &arg, argc, argv)) + goto cleanup; + } + } else { + runner_tests = + realloc((void *)runner.tests, sizeof(char *) * (tests_size + 2)); + if (runner_tests == NULL) { + munit_log_internal(MUNIT_LOG_ERROR, stderr, + "failed to allocate memory"); + goto cleanup; + } + runner.tests = runner_tests; + runner.tests[tests_size++] = argv[arg]; + 
runner.tests[tests_size] = NULL; + } + } + + fflush(stderr); + fprintf(MUNIT_OUTPUT_FILE, + "Running test suite with seed 0x%08" PRIx32 "...\n", runner.seed); + + munit_test_runner_run(&runner); + + tests_run = + runner.report.successful + runner.report.failed + runner.report.errored; + tests_total = tests_run + runner.report.skipped; + if (tests_run == 0) { + fprintf(stderr, "No tests run, %d (100%%) skipped.\n", + runner.report.skipped); + } else { + fprintf(MUNIT_OUTPUT_FILE, + "%d of %d (%0.0f%%) tests successful, %d (%0.0f%%) test skipped.\n", + runner.report.successful, tests_run, + (((double)runner.report.successful) / ((double)tests_run)) * 100.0, + runner.report.skipped, + (((double)runner.report.skipped) / ((double)tests_total)) * 100.0); + } + + if (runner.report.failed == 0 && runner.report.errored == 0) { + result = EXIT_SUCCESS; + } + +cleanup: + free(runner.parameters); + free((void *)runner.tests); + + return result; +} + +int munit_suite_main(const MunitSuite *suite, void *user_data, int argc, + char *const *argv) { + return munit_suite_main_custom(suite, user_data, argc, argv, NULL); +} + +static uint8_t hexchars[] = "0123456789abcdef"; + +static uint8_t *hexdump_addr(uint8_t *dest, size_t addr) { + size_t i; + uint8_t a; + + for (i = 0; i < 4; ++i) { + a = (addr >> (3 - i) * 8) & 0xff; + + *dest++ = hexchars[a >> 4]; + *dest++ = hexchars[a & 0xf]; + } + + return dest; +} + +static uint8_t *asciidump(uint8_t *dest, const uint8_t *data, size_t datalen) { + size_t i; + + *dest++ = '|'; + + for (i = 0; i < datalen; ++i) { + if (0x20 <= data[i] && data[i] <= 0x7e) { + *dest++ = data[i]; + } else { + *dest++ = '.'; + } + } + + *dest++ = '|'; + + return dest; +} + +static uint8_t *hexdump8(uint8_t *dest, const uint8_t *data, size_t datalen) { + size_t i; + + for (i = 0; i < datalen; ++i) { + *dest++ = hexchars[data[i] >> 4]; + *dest++ = hexchars[data[i] & 0xf]; + *dest++ = ' '; + } + + for (; i < 8; ++i) { + *dest++ = ' '; + *dest++ = ' '; + *dest++ = ' 
'; + } + + return dest; +} + +static uint8_t *hexdump16(uint8_t *dest, const uint8_t *data, size_t datalen) { + dest = hexdump8(dest, data, datalen < 8 ? datalen : 8); + *dest++ = ' '; + + if (datalen < 8) { + data = NULL; + datalen = 0; + } else { + data += 8; + datalen -= 8; + } + + dest = hexdump8(dest, data, datalen); + *dest++ = ' '; + + return dest; +} + +static uint8_t *hexdump_line(uint8_t *dest, const uint8_t *data, size_t datalen, + size_t addr) { + dest = hexdump_addr(dest, addr); + *dest++ = ' '; + *dest++ = ' '; + + dest = hexdump16(dest, data, datalen); + + dest = asciidump(dest, data, datalen); + + return dest; +} + +int munit_hexdump(FILE *fp, const void *data, size_t datalen) { + size_t offset = 0, n, len; + uint8_t buf[128], *p; + const uint8_t *s; + int repeated = 0; + + if (datalen == 0) { + return 0; + } + + for (; offset < datalen; offset += 16) { + n = datalen - offset; + s = (const uint8_t *)data + offset; + + if (n >= 16) { + n = 16; + + if (offset > 0) { + if (memcmp(s - 16, s, 16) == 0) { + if (repeated) { + continue; + } + + repeated = 1; + + if (fwrite("*\n", 1, 2, fp) < 2) { + return -1; + } + + continue; + } + + repeated = 0; + } + } + + p = hexdump_line(buf, s, n, offset); + *p++ = '\n'; + + len = (size_t)(p - buf); + + if (fwrite(buf, 1, len, fp) < len) { + return -1; + } + } + + p = hexdump_addr(buf, datalen); + *p++ = '\n'; + + len = (size_t)(p - buf); + + if (fwrite(buf, 1, len, fp) < len) { + return -1; + } + + return 0; +} + +int munit_hexdump_diff(FILE *fp, const void *a, size_t alen, const void *b, + size_t blen) { + size_t offset = 0, k, i, len, ncomp, maxlen, adoff = 0; + uint8_t buf[128], *p; + const uint8_t mk[2] = {'-', '+'}; + struct datasource { + const uint8_t *data; + size_t datalen; + const uint8_t *s; + size_t n; + } ds[] = {{a, alen, NULL, 0}, {b, blen, NULL, 0}}, *dp; + + maxlen = alen < blen ? 
blen : alen; + + for (; offset < maxlen; offset += 16) { + for (k = 0; k < 2; ++k) { + dp = &ds[k]; + + if (offset < dp->datalen) { + dp->s = (const uint8_t *)dp->data + offset; + dp->n = dp->datalen - offset; + + if (dp->n > 16) { + dp->n = 16; + } + } else { + dp->s = NULL; + dp->n = 0; + } + } + + if (ds[0].n == ds[1].n && memcmp(ds[0].s, ds[1].s, ds[0].n) == 0) { + continue; + } + + for (k = 0; k < 2; ++k) { + dp = &ds[k]; + + if (!dp->n) { + continue; + } + + p = buf; + *p++ = mk[k]; + *p++ = mk[k]; + *p++ = mk[k]; + *p++ = mk[k]; + + p = hexdump_line(p, dp->s, dp->n, offset); + *p++ = '\n'; + + len = (size_t)(p - buf); + + if (fwrite(buf, 1, len, fp) < len) { + return -1; + } + } + + if (!ds[0].n || !ds[1].n) { + continue; + } + + ncomp = ds[0].n < ds[1].n ? ds[0].n : ds[1].n; + + p = buf + 4 + 10; + + memset(buf, ' ', 4 + 78); + + for (i = 0; i < ncomp; ++i) { + if (ds[0].s[i] == ds[1].s[i]) { + *p++ = ' '; + *p++ = ' '; + } else { + adoff = 4 + 10 + 51 + i; + *(buf + adoff) = '^'; + + *p++ = '^'; + *p++ = '^'; + } + + *p++ = ' '; + + if (i == 7) { + *p++ = ' '; + } + } + + if (adoff) { + len = adoff + 1; + } else { + len = (size_t)(p - buf); + } + + buf[len++] = '\n'; + + if (fwrite(buf, 1, len, fp) < len) { + return -1; + } + } + + return 0; +} diff --git a/sys/contrib/openzfs/tests/unit/munit.h b/sys/contrib/openzfs/tests/unit/munit.h new file mode 100644 index 00000000000..b10d10ee0a5 --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/munit.h @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: MIT +/* µnit Testing Framework + * Copyright (c) 2013-2017 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software 
is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MUNIT_H +#define MUNIT_H + +#include +#include +#include +#include + +#define MUNIT_VERSION(major, minor, revision) \ + (((major) << 16) | ((minor) << 8) | (revision)) + +#define MUNIT_CURRENT_VERSION MUNIT_VERSION(0, 4, 1) + +#if defined(_MSC_VER) && (_MSC_VER < 1600) +# define munit_int8_t __int8 +# define munit_uint8_t unsigned __int8 +# define munit_int16_t __int16 +# define munit_uint16_t unsigned __int16 +# define munit_int32_t __int32 +# define munit_uint32_t unsigned __int32 +# define munit_int64_t __int64 +# define munit_uint64_t unsigned __int64 +#else +# include +# define munit_int8_t int8_t +# define munit_uint8_t uint8_t +# define munit_int16_t int16_t +# define munit_uint16_t uint16_t +# define munit_int32_t int32_t +# define munit_uint32_t uint32_t +# define munit_int64_t int64_t +# define munit_uint64_t uint64_t +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1800) +# if !defined(PRIi8) +# define PRIi8 "i" +# endif +# if !defined(PRIi16) +# define PRIi16 "i" +# endif +# if !defined(PRIi32) +# define PRIi32 "i" +# endif +# if !defined(PRIi64) +# define PRIi64 "I64i" +# endif +# if !defined(PRId8) +# define PRId8 "d" +# endif +# if !defined(PRId16) +# define PRId16 "d" +# endif +# if !defined(PRId32) +# define PRId32 "d" +# endif +# if 
!defined(PRId64) +# define PRId64 "I64d" +# endif +# if !defined(PRIx8) +# define PRIx8 "x" +# endif +# if !defined(PRIx16) +# define PRIx16 "x" +# endif +# if !defined(PRIx32) +# define PRIx32 "x" +# endif +# if !defined(PRIx64) +# define PRIx64 "I64x" +# endif +# if !defined(PRIu8) +# define PRIu8 "u" +# endif +# if !defined(PRIu16) +# define PRIu16 "u" +# endif +# if !defined(PRIu32) +# define PRIu32 "u" +# endif +# if !defined(PRIu64) +# define PRIu64 "I64u" +# endif +#else +# include +#endif + +#if !defined(munit_bool) +# if defined(bool) +# define munit_bool bool +# elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) +# define munit_bool _Bool +# else +# define munit_bool int +# endif +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) +# define MUNIT_LIKELY(expr) (__builtin_expect((expr), 1)) +# define MUNIT_UNLIKELY(expr) (__builtin_expect((expr), 0)) +# define MUNIT_UNUSED __attribute__((__unused__)) +#else +# define MUNIT_LIKELY(expr) (expr) +# define MUNIT_UNLIKELY(expr) (expr) +# define MUNIT_UNUSED +#endif + +#if !defined(_WIN32) +# define MUNIT_SIZE_MODIFIER "z" +# define MUNIT_CHAR_MODIFIER "hh" +# define MUNIT_SHORT_MODIFIER "h" +#else +# if defined(_M_X64) || defined(__amd64__) +# define MUNIT_SIZE_MODIFIER "I64" +# else +# define MUNIT_SIZE_MODIFIER "" +# endif +# define MUNIT_CHAR_MODIFIER "" +# define MUNIT_SHORT_MODIFIER "" +#endif + +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +# define MUNIT_NO_RETURN _Noreturn +#elif defined(__GNUC__) +# define MUNIT_NO_RETURN __attribute__((__noreturn__)) +#elif defined(_MSC_VER) +# define MUNIT_NO_RETURN __declspec(noreturn) +#else +# define MUNIT_NO_RETURN +#endif + +#if defined(_MSC_VER) && (_MSC_VER >= 1500) +# define MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + __pragma(warning(push)) __pragma(warning(disable : 4127)) +# define MUNIT_POP_DISABLE_MSVC_C4127_ __pragma(warning(pop)) +#else +# define MUNIT_PUSH_DISABLE_MSVC_C4127_ +# define 
MUNIT_POP_DISABLE_MSVC_C4127_ +#endif + +typedef enum { + MUNIT_LOG_DEBUG, + MUNIT_LOG_INFO, + MUNIT_LOG_WARNING, + MUNIT_LOG_ERROR +} MunitLogLevel; + +#if defined(__GNUC__) && !defined(__MINGW32__) +# define MUNIT_PRINTF(string_index, first_to_check) \ + __attribute__((format(printf, string_index, first_to_check))) +#else +# define MUNIT_PRINTF(string_index, first_to_check) +#endif + +MUNIT_PRINTF(4, 5) +void munit_logf_ex(MunitLogLevel level, const char *filename, int line, + const char *format, ...); + +#define munit_logf(level, format, ...) \ + munit_logf_ex(level, __FILE__, __LINE__, format, __VA_ARGS__) + +#define munit_log(level, msg) munit_logf(level, "%s", msg) + +MUNIT_NO_RETURN +MUNIT_PRINTF(3, 4) +void munit_errorf_ex(const char *filename, int line, const char *format, ...); + +#define munit_errorf(format, ...) \ + munit_errorf_ex(__FILE__, __LINE__, format, __VA_ARGS__) + +#define munit_error(msg) munit_errorf("%s", msg) + +#define munit_assert(expr) \ + do { \ + if (!MUNIT_LIKELY(expr)) { \ + munit_error("assertion failed: " #expr); \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) MUNIT_POP_DISABLE_MSVC_C4127_ + +#define munit_assert_true(expr) \ + do { \ + if (!MUNIT_LIKELY(expr)) { \ + munit_error("assertion failed: " #expr " is not true"); \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) MUNIT_POP_DISABLE_MSVC_C4127_ + +#define munit_assert_false(expr) \ + do { \ + if (!MUNIT_LIKELY(!(expr))) { \ + munit_error("assertion failed: " #expr " is not false"); \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) MUNIT_POP_DISABLE_MSVC_C4127_ + +#define munit_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ + do { \ + T munit_tmp_a_ = (a); \ + T munit_tmp_b_ = (b); \ + if (!(munit_tmp_a_ op munit_tmp_b_)) { \ + munit_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix \ + " %s " prefix "%" fmt suffix ")", \ + #a, #op, #b, munit_tmp_a_, #op, munit_tmp_b_); \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) 
MUNIT_POP_DISABLE_MSVC_C4127_ + +#define munit_assert_type(T, fmt, a, op, b) \ + munit_assert_type_full("", "", T, fmt, a, op, b) + +#define munit_assert_char(a, op, b) \ + munit_assert_type_full("'\\x", "'", char, "02" MUNIT_CHAR_MODIFIER "x", a, \ + op, b) +#define munit_assert_uchar(a, op, b) \ + munit_assert_type_full("'\\x", "'", unsigned char, \ + "02" MUNIT_CHAR_MODIFIER "x", a, op, b) +#define munit_assert_short(a, op, b) \ + munit_assert_type(short, MUNIT_SHORT_MODIFIER "d", a, op, b) +#define munit_assert_ushort(a, op, b) \ + munit_assert_type(unsigned short, MUNIT_SHORT_MODIFIER "u", a, op, b) +#define munit_assert_int(a, op, b) munit_assert_type(int, "d", a, op, b) +#define munit_assert_uint(a, op, b) \ + munit_assert_type(unsigned int, "u", a, op, b) +#define munit_assert_long(a, op, b) munit_assert_type(long int, "ld", a, op, b) +#define munit_assert_ulong(a, op, b) \ + munit_assert_type(unsigned long int, "lu", a, op, b) +#define munit_assert_llong(a, op, b) \ + munit_assert_type(long long int, "lld", a, op, b) +#define munit_assert_ullong(a, op, b) \ + munit_assert_type(unsigned long long int, "llu", a, op, b) + +#define munit_assert_size(a, op, b) \ + munit_assert_type(size_t, MUNIT_SIZE_MODIFIER "u", a, op, b) +#define munit_assert_ssize(a, op, b) \ + munit_assert_type(ssize_t, MUNIT_SIZE_MODIFIER "d", a, op, b) + +#define munit_assert_float(a, op, b) munit_assert_type(float, "f", a, op, b) +#define munit_assert_double(a, op, b) munit_assert_type(double, "g", a, op, b) +#define munit_assert_ptr(a, op, b) \ + munit_assert_type(const void *, "p", a, op, b) + +#define munit_assert_int8(a, op, b) \ + munit_assert_type(munit_int8_t, PRIi8, a, op, b) +#define munit_assert_uint8(a, op, b) \ + munit_assert_type(munit_uint8_t, PRIu8, a, op, b) +#define munit_assert_int16(a, op, b) \ + munit_assert_type(munit_int16_t, PRIi16, a, op, b) +#define munit_assert_uint16(a, op, b) \ + munit_assert_type(munit_uint16_t, PRIu16, a, op, b) +#define 
munit_assert_int32(a, op, b) \ + munit_assert_type(munit_int32_t, PRIi32, a, op, b) +#define munit_assert_uint32(a, op, b) \ + munit_assert_type(munit_uint32_t, PRIu32, a, op, b) +#define munit_assert_int64(a, op, b) \ + munit_assert_type(munit_int64_t, PRIi64, a, op, b) +#define munit_assert_uint64(a, op, b) \ + munit_assert_type(munit_uint64_t, PRIu64, a, op, b) + +#define munit_assert_ptrdiff(a, op, b) \ + munit_assert_type(ptrdiff_t, "td", a, op, b) + +#define munit_assert_enum(T, a, op, b) munit_assert_type(T, "d", a, op, b) + +#define munit_assert_double_equal(a, b, precision) \ + do { \ + const double munit_tmp_a_ = (a); \ + const double munit_tmp_b_ = (b); \ + const double munit_tmp_diff_ = ((munit_tmp_a_ - munit_tmp_b_) < 0) \ + ? -(munit_tmp_a_ - munit_tmp_b_) \ + : (munit_tmp_a_ - munit_tmp_b_); \ + if (MUNIT_UNLIKELY(munit_tmp_diff_ > 1e-##precision)) { \ + munit_errorf("assertion failed: %s == %s (%0." #precision \ + "g == %0." #precision "g)", \ + #a, #b, munit_tmp_a_, munit_tmp_b_); \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) MUNIT_POP_DISABLE_MSVC_C4127_ + +#include <string.h> /* strcmp(), strlen() and memcmp() for the assertions below */ +#define munit_assert_string_equal(a, b) \ + do { \ + const char *munit_tmp_a_ = (a); \ + const char *munit_tmp_b_ = (b); \ + if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) != 0)) { \ + munit_hexdump_diff(stderr, munit_tmp_a_, strlen(munit_tmp_a_), \ + munit_tmp_b_, strlen(munit_tmp_b_)); \ + munit_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")", #a, \ + #b, munit_tmp_a_, munit_tmp_b_); \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) MUNIT_POP_DISABLE_MSVC_C4127_ + +#define munit_assert_string_not_equal(a, b) \ + do { \ + const char *munit_tmp_a_ = (a); \ + const char *munit_tmp_b_ = (b); \ + if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) == 0)) { \ + munit_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")", #a, \ + #b, munit_tmp_a_, munit_tmp_b_); \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) 
MUNIT_POP_DISABLE_MSVC_C4127_ + +/* + * Fail unless the first `size` bytes of a and b are equal. Each argument is + * evaluated exactly once; on mismatch both buffers are passed to + * munit_hexdump_diff() with stderr and the first differing offset is reported. + */ +#define munit_assert_memory_equal(size, a, b) \ + do { \ + const unsigned char *munit_tmp_a_ = (const unsigned char *)(a); \ + const unsigned char *munit_tmp_b_ = (const unsigned char *)(b); \ + const size_t munit_tmp_size_ = (size); \ + if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) != \ + 0) { \ + size_t munit_tmp_pos_; \ + for (munit_tmp_pos_ = 0; munit_tmp_pos_ < munit_tmp_size_; \ + munit_tmp_pos_++) { \ + if (munit_tmp_a_[munit_tmp_pos_] != munit_tmp_b_[munit_tmp_pos_]) { \ + munit_hexdump_diff(stderr, munit_tmp_a_, munit_tmp_size_, \ + munit_tmp_b_, munit_tmp_size_); \ + munit_errorf("assertion failed: memory %s == %s, at offset " \ + "%" MUNIT_SIZE_MODIFIER "u", \ + #a, #b, munit_tmp_pos_); \ + break; \ + } \ + } \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) MUNIT_POP_DISABLE_MSVC_C4127_ + +#define munit_assert_memn_equal(a, a_size, b, b_size) \ + do { \ + const unsigned char *munit_tmp_a_ = (const unsigned char *)(a); \ + const unsigned char *munit_tmp_b_ = (const unsigned char *)(b); \ + const size_t munit_tmp_a_size_ = (a_size); \ + const size_t munit_tmp_b_size_ = (b_size); \ + if (MUNIT_UNLIKELY(munit_tmp_a_size_ != munit_tmp_b_size_) || \ + MUNIT_UNLIKELY(munit_tmp_a_size_ && memcmp(munit_tmp_a_, munit_tmp_b_, \ + munit_tmp_a_size_)) != 0) { \ + munit_hexdump_diff(stderr, munit_tmp_a_, munit_tmp_a_size_, \ + munit_tmp_b_, munit_tmp_b_size_); \ + munit_errorf("assertion failed: memory %s == %s", #a, #b); \ + } \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) MUNIT_POP_DISABLE_MSVC_C4127_ + +/* + * Fail if the first `size` bytes of a and b are equal. The size is printed + * with MUNIT_SIZE_MODIFIER, matching the offset in munit_assert_memory_equal. + */ +#define munit_assert_memory_not_equal(size, a, b) \ + do { \ + const unsigned char *munit_tmp_a_ = (const unsigned char *)(a); \ + const unsigned char *munit_tmp_b_ = (const unsigned char *)(b); \ + const size_t munit_tmp_size_ = (size); \ + if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) == \ + 0) { \ + munit_errorf("assertion failed: memory %s != %s (%" MUNIT_SIZE_MODIFIER \ + "u bytes)", #a, #b, \ + munit_tmp_size_); \ + 
} \ + MUNIT_PUSH_DISABLE_MSVC_C4127_ \ + } while (0) MUNIT_POP_DISABLE_MSVC_C4127_ + +#define munit_assert_ptr_equal(a, b) munit_assert_ptr(a, ==, b) +#define munit_assert_ptr_not_equal(a, b) munit_assert_ptr(a, !=, b) +#define munit_assert_null(ptr) munit_assert_ptr(ptr, ==, NULL) +#define munit_assert_not_null(ptr) munit_assert_ptr(ptr, !=, NULL) +#define munit_assert_ptr_null(ptr) munit_assert_ptr(ptr, ==, NULL) +#define munit_assert_ptr_not_null(ptr) munit_assert_ptr(ptr, !=, NULL) + +/*** Memory allocation ***/ + +void *munit_malloc_ex(const char *filename, int line, size_t size); + +#define munit_malloc(size) munit_malloc_ex(__FILE__, __LINE__, (size)) + +#define munit_new(type) ((type *)munit_malloc(sizeof(type))) + +#define munit_calloc(nmemb, size) munit_malloc((nmemb) * (size)) + +#define munit_newa(type, nmemb) ((type *)munit_calloc((nmemb), sizeof(type))) + +/*** Random number generation ***/ + +void munit_rand_seed(munit_uint32_t seed); +munit_uint32_t munit_rand_uint32(void); +int munit_rand_int_range(int min, int max); +double munit_rand_double(void); +void munit_rand_memory(size_t size, munit_uint8_t *buffer); + +/*** Tests and Suites ***/ + +typedef enum { + /* Test successful */ + MUNIT_OK, + /* Test failed */ + MUNIT_FAIL, + /* Test was skipped */ + MUNIT_SKIP, + /* Test failed due to circumstances not intended to be tested + * (things like network errors, invalid parameter value, failure to + * allocate memory in the test harness, etc.). 
*/ + MUNIT_ERROR +} MunitResult; + +typedef struct { + char *name; + char **values; +} MunitParameterEnum; + +typedef struct { + char *name; + char *value; +} MunitParameter; + +const char *munit_parameters_get(const MunitParameter params[], + const char *key); + +typedef enum { + MUNIT_TEST_OPTION_NONE = 0, + MUNIT_TEST_OPTION_SINGLE_ITERATION = 1 << 0, + MUNIT_TEST_OPTION_TODO = 1 << 1 +} MunitTestOptions; + +typedef MunitResult (*MunitTestFunc)(const MunitParameter params[], + void *user_data_or_fixture); +typedef void *(*MunitTestSetup)(const MunitParameter params[], void *user_data); +typedef void (*MunitTestTearDown)(void *fixture); + +typedef struct { + const char *name; + MunitTestFunc test; + MunitTestSetup setup; + MunitTestTearDown tear_down; + MunitTestOptions options; + MunitParameterEnum *parameters; +} MunitTest; + +typedef enum { MUNIT_SUITE_OPTION_NONE = 0 } MunitSuiteOptions; + +typedef struct MunitSuite_ MunitSuite; + +struct MunitSuite_ { + const char *prefix; + const MunitTest *tests; + const MunitSuite *suites; + unsigned int iterations; + MunitSuiteOptions options; +}; + +int munit_suite_main(const MunitSuite *suite, void *user_data, int argc, + char *const *argv); + +/* Note: I'm not very happy with this API; it's likely to change if I + * figure out something better. Suggestions welcome. 
*/ + +typedef struct MunitArgument_ MunitArgument; + +struct MunitArgument_ { + char *name; + munit_bool (*parse_argument)(const MunitSuite *suite, void *user_data, + int *arg, int argc, char *const *argv); + void (*write_help)(const MunitArgument *argument, void *user_data); +}; + +int munit_suite_main_custom(const MunitSuite *suite, void *user_data, int argc, + char *const *argv, const MunitArgument arguments[]); + +#if defined(MUNIT_ENABLE_ASSERT_ALIASES) + +# define assert_true(expr) munit_assert_true(expr) +# define assert_false(expr) munit_assert_false(expr) +# define assert_char(a, op, b) munit_assert_char(a, op, b) +# define assert_uchar(a, op, b) munit_assert_uchar(a, op, b) +# define assert_short(a, op, b) munit_assert_short(a, op, b) +# define assert_ushort(a, op, b) munit_assert_ushort(a, op, b) +# define assert_int(a, op, b) munit_assert_int(a, op, b) +# define assert_uint(a, op, b) munit_assert_uint(a, op, b) +# define assert_long(a, op, b) munit_assert_long(a, op, b) +# define assert_ulong(a, op, b) munit_assert_ulong(a, op, b) +# define assert_llong(a, op, b) munit_assert_llong(a, op, b) +# define assert_ullong(a, op, b) munit_assert_ullong(a, op, b) +# define assert_size(a, op, b) munit_assert_size(a, op, b) +# define assert_ssize(a, op, b) munit_assert_ssize(a, op, b) +# define assert_float(a, op, b) munit_assert_float(a, op, b) +# define assert_double(a, op, b) munit_assert_double(a, op, b) +# define assert_ptr(a, op, b) munit_assert_ptr(a, op, b) + +# define assert_int8(a, op, b) munit_assert_int8(a, op, b) +# define assert_uint8(a, op, b) munit_assert_uint8(a, op, b) +# define assert_int16(a, op, b) munit_assert_int16(a, op, b) +# define assert_uint16(a, op, b) munit_assert_uint16(a, op, b) +# define assert_int32(a, op, b) munit_assert_int32(a, op, b) +# define assert_uint32(a, op, b) munit_assert_uint32(a, op, b) +# define assert_int64(a, op, b) munit_assert_int64(a, op, b) +# define assert_uint64(a, op, b) munit_assert_uint64(a, op, b) + +# 
define assert_ptrdiff(a, op, b) munit_assert_ptrdiff(a, op, b) + +# define assert_enum(T, a, op, b) munit_assert_enum(T, a, op, b) + +# define assert_double_equal(a, b, precision) \ + munit_assert_double_equal(a, b, precision) +# define assert_string_equal(a, b) munit_assert_string_equal(a, b) +# define assert_string_not_equal(a, b) munit_assert_string_not_equal(a, b) +# define assert_memory_equal(size, a, b) munit_assert_memory_equal(size, a, b) +# define assert_memn_equal(a, a_size, b, b_size) \ + munit_assert_memn_equal(a, a_size, b, b_size) +# define assert_memory_not_equal(size, a, b) \ + munit_assert_memory_not_equal(size, a, b) +/* Each assert_ptr_* alias forwards to the munit_assert_ptr_* macro of the same name. */ +# define assert_ptr_equal(a, b) munit_assert_ptr_equal(a, b) +# define assert_ptr_not_equal(a, b) munit_assert_ptr_not_equal(a, b) +# define assert_ptr_null(ptr) munit_assert_ptr_null(ptr) +# define assert_ptr_not_null(ptr) munit_assert_ptr_not_null(ptr) + +# define assert_null(ptr) munit_assert_null(ptr) +# define assert_not_null(ptr) munit_assert_not_null(ptr) + +#endif /* defined(MUNIT_ENABLE_ASSERT_ALIASES) */ + +#define munit_void_test_decl(func) \ + void func(void); \ + \ + static inline MunitResult wrap_##func(const MunitParameter params[], \ + void *fixture) { \ + (void)params; \ + (void)fixture; \ + \ + func(); \ + return MUNIT_OK; \ + } + +#define munit_void_test(func) \ + {"/" #func, wrap_##func, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL} + +#define munit_test_end() {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL} + +int munit_hexdump(FILE *fp, const void *data, size_t datalen); + +int munit_hexdump_diff(FILE *fp, const void *a, size_t alen, const void *b, + size_t blen); + +#if defined(__cplusplus) +} +#endif + +#endif /* !defined(MUNIT_H) */ + +#if defined(MUNIT_ENABLE_ASSERT_ALIASES) +#if defined(assert) +# undef assert +#endif +#define assert(expr) munit_assert(expr) +#endif diff --git a/sys/contrib/openzfs/tests/unit/test_zap.c b/sys/contrib/openzfs/tests/unit/test_zap.c new file mode 100644 index
00000000000..c64de7d75c4 --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/test_zap.c @@ -0,0 +1,1170 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2026, TrueNAS. + */ + +#include + +#include +#include +typedef struct spa spa_t; /* forward decl for zap_impl.h */ +#include + +#include "mock_dmu.h" +#include "unit.h" + +/* ========== */ + +/* + * Normally defined and initialised in arc.c. We define and initialise it + * ourselves here so this mock can be linked without arc.c. + */ +uint64_t zfs_crc64_table[256]; + +static void +mock_crc64_init(void) +{ + for (int i = 0; i < 256; i++) { + uint64_t ct = i; + for (int j = 8; j > 0; j--) + ct = (ct >> 1) ^ (-(ct & 1) & ZFS_CRC64_POLY); + zfs_crc64_table[i] = ct; + } +} + +/* Misc utility functions. */ + +#define rd64(ptr, off) (*(uint64_t *)((const char *)(ptr) + (off))) + +/* ========== */ + +/* ZAP-specific mocks and other test helpers. */ + +/* Create a microzap backed by a mock dnode. */ +static dnode_t * +mock_zap_create_microzap(void) { + /* + * We use DMU_OTN_ZAP_DATA so that DMU_OT_BYTESWAP() returns + * DMU_BSWAP_ZAP without consulting dmu_ot[], which is not currently + * provided in the mock. + */ + mock_dnode_t *mdn = mock_dnode_create(512, DMU_OTN_ZAP_DATA); + dnode_t *dn = (dnode_t *)mdn; + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + mzap_create_impl(dn, 0, 0, tx); + mock_tx_destroy((mock_dmu_tx_t *)tx); + return (dn); +} + +/* Create a fatzap backed by a mock dnode. */ +static dnode_t * +mock_zap_create_fatzap(void) +{ + /* + * We can only create microzaps directly. 
They only take u64s as a + * value, so we add a u16 to trigger an upgrade to fatzap. + */ + dnode_t *dn = mock_zap_create_microzap(); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + uint16_t upgrade = 0; + zap_add_by_dnode(dn, "_upgrade", sizeof (uint16_t), 1, &upgrade, tx); + zap_remove_by_dnode(dn, "_upgrade", tx); + mock_tx_destroy((mock_dmu_tx_t *)tx); + return (dn); +} + +static bool +mock_zap_is_microzap(dnode_t *dn) +{ + /* check block 0 has a microzap header */ + const void *blk = mock_dnode_block_data((mock_dnode_t *)dn, 0); + return (rd64(blk, 0) == ZBT_MICRO); +} + +static bool +mock_zap_is_fatzap(dnode_t *dn) +{ + /* check block 0 has a fatzap header */ + const void *blk = mock_dnode_block_data((mock_dnode_t *)dn, 0); + return (rd64(blk, 0) == ZBT_HEADER && rd64(blk, 8) == ZAP_MAGIC); +} + +static void +mock_zap_destroy(dnode_t *dn) +{ + mock_dnode_t *mdn = (mock_dnode_t *)dn; + unit_eq(mock_dnode_refcount(mdn), 1); + mock_dnode_destroy(mdn); +} + +/* Create a ZAP of the type named in the given test params. */ +static dnode_t * +mock_zap_create_params(const MunitParameter params[], const char *key) { + const char *type = munit_parameters_get(params, key); + if (type == NULL) + munit_error("mock_zap_create_params: missing type param"); + else if (strcmp(type, "micro") == 0) + return (mock_zap_create_microzap()); + else if (strcmp(type, "fat") == 0) + return (mock_zap_create_fatzap()); + else + munit_errorf("mock_zap_create_params: invalid type '%s'", type); + __builtin_unreachable(); +} + +/* + * Confirm the stored ZAP is of the type named in the given test params. This + * is useful for sanity checks within tests that a ZAP wasn't unexpectedly + * upgraded during the test. 
+ */ +static bool +mock_zap_is_params(dnode_t *dn, const MunitParameter params[], + const char *key) +{ + const char *type = munit_parameters_get(params, key); + if (type == NULL) + munit_error("mock_zap_is_params: missing type param"); + else if (strcmp(type, "micro") == 0) + return (mock_zap_is_microzap(dn)); + else if (strcmp(type, "fat") == 0) + return (mock_zap_is_fatzap(dn)); + else + munit_errorf("mock_zap_is_params: invalid type '%s'", type); + __builtin_unreachable(); +} + +/* ========== */ + +/* + * Sanity checks for mock ZAPs. Ensures that the mock_zap_create_* functions + * really do create the right kind of ZAPs, since many of the tests need to + * run against both kinds to confirm that they all work the same way. + */ +static MunitResult +test_mock_microzap_sanity(const MunitParameter params[], void *data) +{ + (void) params, (void) data; + + dnode_t *dn = mock_zap_create_microzap(); + unit_true(mock_zap_is_microzap(dn)); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +static MunitResult +test_mock_fatzap_sanity(const MunitParameter params[], void *data) +{ + (void) params, (void) data; + + dnode_t *dn = mock_zap_create_fatzap(); + unit_true(mock_zap_is_fatzap(dn)); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* ========== */ + +/* + * A simple add, lookup and remove test. Confirms basic operation. These are + * tested together simply because all other tests rely on these primitives. + */ +static MunitResult +test_zap_basic(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* Insert a few entries. 
*/ + uint64_t val42 = 42; + uint64_t val99 = 99; + uint64_t val0 = 0; + + unit_ok(zap_add_by_dnode(dn, "hello", + sizeof (uint64_t), 1, &val42, tx)); + unit_ok(zap_add_by_dnode(dn, "world", + sizeof (uint64_t), 1, &val99, tx)); + unit_ok(zap_add_by_dnode(dn, "zero", + sizeof (uint64_t), 1, &val0, tx)); + + /* Lookup each entry. */ + uint64_t result = 0; + unit_ok(zap_lookup_by_dnode(dn, "hello", + sizeof (uint64_t), 1, &result)); + unit_eq(result, 42); + + unit_ok(zap_lookup_by_dnode(dn, "world", + sizeof (uint64_t), 1, &result)); + unit_eq(result, 99); + + unit_ok(zap_lookup_by_dnode(dn, "zero", + sizeof (uint64_t), 1, &result)); + unit_eq(result, 0); + + /* Non-existent key should return ENOENT. */ + unit_err(zap_lookup_by_dnode(dn, "nope", + sizeof (uint64_t), 1, &result), ENOENT); + + /* Removing an entry should make it impossible to look up. */ + unit_ok(zap_remove_by_dnode(dn, "world", tx)); + unit_err(zap_lookup_by_dnode(dn, "world", + sizeof (uint64_t), 1, &result), ENOENT); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* ========== */ + +/* + * "Core" ZAP API tests. Covers the most basic functionality upon which + * everything else is built. + * + * Note that to avoid microzap upgrade here, we only use short keys and + * single-uint64 values. + */ + +/* zap_add: add new items. */ +static MunitResult +test_zap_add(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* A key added can be found by that name. */ + uint64_t va = 1, var = 0; + unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &va, tx)); + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var)); + unit_eq(var, 1); + + /* Another key added can be found by that name. 
*/ + uint64_t vb = 2, vbr = 0; + unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &vb, tx)); + unit_ok(zap_lookup_by_dnode(dn, "b", sizeof (uint64_t), 1, &vbr)); + unit_eq(vbr, 2); + + /* The first key is still findable with the right value. */ + var = 0; + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var)); + unit_eq(var, 1); + + /* Adding the key again fails. */ + unit_err(zap_add_by_dnode(dn, "a", + sizeof (uint64_t), 1, &va, tx), EEXIST); + + /* Adding the key with a different value still fails. */ + va = 2; + unit_err(zap_add_by_dnode(dn, "a", + sizeof (uint64_t), 1, &va, tx), EEXIST); + + /* And is still findable with the original value. */ + var = 0; + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var)); + unit_eq(var, 1); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* zap_update: add new or replace existing items. */ +static MunitResult +test_zap_update(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* Update on a non-existent key inserts it. */ + uint64_t va = 1, var = 0; + unit_ok(zap_update_by_dnode(dn, "a", sizeof (uint64_t), 1, &va, tx)); + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var)); + unit_eq(var, 1); + + /* Update on an existing key replaces it without error. */ + va = 2; + unit_ok(zap_update_by_dnode(dn, "a", sizeof (uint64_t), 1, &va, tx)); + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var)); + unit_eq(var, 2); + + /* Count should still be 1 (no duplicate was created). */ + uint64_t count = 0; + unit_ok(zap_count_by_dnode(dn, &count)); + unit_eq(count, 1); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* zap_remove: remove existing items. 
*/ +static MunitResult +test_zap_remove(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* Removing a non-existing key fails. */ + unit_err(zap_remove_by_dnode(dn, "a", tx), ENOENT); + + /* Adding two keys. */ + uint64_t va = 1, vb = 2; + unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &va, tx)); + unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &vb, tx)); + + /* Remove an existing key succeeds. */ + unit_ok(zap_remove_by_dnode(dn, "a", tx)); + + /* After removing, looking up removed key fails. */ + uint64_t var = 0; + unit_err( + zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &var), ENOENT); + + /* Looking up the other key succeeds, and has the correct value. */ + uint64_t vbr = 0; + unit_ok(zap_lookup_by_dnode(dn, "b", sizeof (uint64_t), 1, &vbr)); + unit_eq(vbr, 2); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* zap_count: number of entries, typically without lookup or traversal. */ +static MunitResult +test_zap_count(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* A new ZAP has zero entries. */ + uint64_t count = 0; + unit_ok(zap_count_by_dnode(dn, &count)); + unit_eq(count, 0); + + /* Adding two keys bumps the count to 2. */ + uint64_t v = 1; + unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx)); + unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &v, tx)); + unit_ok(zap_count_by_dnode(dn, &count)); + unit_eq(count, 2); + + /* Removing a key reduces the count. 
*/ + unit_ok(zap_remove_by_dnode(dn, "a", tx)); + unit_ok(zap_count_by_dnode(dn, &count)); + unit_eq(count, 1); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* zap_contains: existence check without reading the value. */ +static MunitResult +test_zap_contains(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + uint64_t v = 1; + unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx)); + unit_ok(zap_contains_by_dnode(dn, "a")); + unit_err(zap_contains_by_dnode(dn, "b"), ENOENT); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* zap_length: item metadata without reading the value. */ +static MunitResult +test_zap_length(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* uint64: integer_size=8, num_integers=1. */ + uint64_t v = 42; + unit_ok(zap_add_by_dnode(dn, "u64", + sizeof (uint64_t), 1, &v, tx)); + + uint64_t isz = 0, nint = 0; + unit_ok(zap_length_by_dnode(dn, "u64", &isz, &nint)); + unit_eq(isz, 8); + unit_eq(nint, 1); + + /* Missing key returns ENOENT. */ + unit_err(zap_length_by_dnode(dn, "nope", &isz, &nint), ENOENT); + + /* Either output pointer may be NULL. 
*/ + isz = 0; nint = 0; + unit_ok(zap_length_by_dnode(dn, "u64", NULL, &nint)); + unit_ok(zap_length_by_dnode(dn, "u64", &isz, NULL)); + unit_eq(isz, 8); + unit_eq(nint, 1); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* zap_increment: add integer value to existing integer */ +static MunitResult +test_zap_increment(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + uint64_t r = 0; + + /* Increment a missing key creates it with that value. */ + unit_ok(zap_increment_by_dnode(dn, "a", 5, tx)); + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &r)); + unit_eq(r, 5); + + /* Further increments accumulate. */ + unit_ok(zap_increment_by_dnode(dn, "a", 3, tx)); + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &r)); + unit_eq(r, 8); + + /* Decrement works. */ + unit_ok(zap_increment_by_dnode(dn, "a", -2, tx)); + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &r)); + unit_eq(r, 6); + + /* Zero delta leaves it unchanged. */ + r = 0; + unit_ok(zap_increment_by_dnode(dn, "a", 0, tx)); + unit_ok(zap_lookup_by_dnode(dn, "a", sizeof (uint64_t), 1, &r)); + unit_eq(r, 6); + + /* Decrementing to zero removes the entry. */ + unit_ok(zap_increment_by_dnode(dn, "a", -6, tx)); + unit_err(zap_lookup_by_dnode(dn, "a", + sizeof (uint64_t), 1, &r), ENOENT); + + /* Delta of zero is a no-op even for a missing key. */ + unit_ok(zap_increment_by_dnode(dn, "a", 0, tx)); + unit_err(zap_lookup_by_dnode(dn, "a", + sizeof (uint64_t), 1, &r), ENOENT); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* ========== */ + +/* + * zap_add_int/zap_remove_int/zap_lookup_int: single uint64_t value, + * stringified to form the key. 
+ */ +static MunitResult +test_zap_int(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* Add some ints. */ + unit_ok(zap_add_int_by_dnode(dn, 5, tx)); + unit_ok(zap_add_int_by_dnode(dn, 17, tx)); + + /* Confirm they're there. */ + unit_ok(zap_lookup_int_by_dnode(dn, 17)); + unit_ok(zap_lookup_int_by_dnode(dn, 5)); + + /* But not something we didn't add. */ + unit_err(zap_lookup_int_by_dnode(dn, 23), ENOENT); + + /* Adding something that already exists fails. */ + unit_err(zap_add_int_by_dnode(dn, 17, tx), EEXIST); + + /* Removing it works, and then it can't be found. */ + unit_ok(zap_remove_int_by_dnode(dn, 17, tx)); + unit_err(zap_lookup_int_by_dnode(dn, 17), ENOENT); + + /* And it can be added back. */ + unit_ok(zap_add_int_by_dnode(dn, 17, tx)); + unit_ok(zap_lookup_int_by_dnode(dn, 17)); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* zap_*_int_key: like zap_*_int, but with separate value. */ +static MunitResult +test_zap_int_keys(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* Add some ints. */ + unit_ok(zap_add_int_key_by_dnode(dn, 5, 17, tx)); + unit_ok(zap_add_int_key_by_dnode(dn, 23, 35, tx)); + + /* Confirm they're there. */ + uint64_t r = 0; + unit_ok(zap_lookup_int_key_by_dnode(dn, 5, &r)); + unit_eq(r, 17); + unit_ok(zap_lookup_int_key_by_dnode(dn, 23, &r)); + unit_eq(r, 35); + + /* But not something we didn't add. */ + unit_err(zap_lookup_int_key_by_dnode(dn, 79, &r), ENOENT); + + /* Adding something that already exists fails. */ + unit_err(zap_add_int_key_by_dnode(dn, 23, 51, tx), EEXIST); + + /* Updating it works though. 
*/ + unit_ok(zap_update_int_key_by_dnode(dn, 23, 51, tx)); + + /* Removing it works, and then it can't be found. */ + unit_ok(zap_remove_int_by_dnode(dn, 23, tx)); + unit_err(zap_lookup_int_key_by_dnode(dn, 23, &r), ENOENT); + + /* And it can be added back. */ + unit_ok(zap_add_int_key_by_dnode(dn, 23, 11, tx)); + unit_ok(zap_lookup_int_key_by_dnode(dn, 23, &r)); + unit_eq(r, 11); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* ========== */ + +/* + * Separate stats tests for each ZAP type, since they are about internals and + * so can and will produce different results. + */ + +static MunitResult +test_microzap_stats(const MunitParameter params[], void *data) +{ + (void) params; (void) data; + + dnode_t *dn = mock_zap_create_microzap(); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + zap_stats_t zs; + uint64_t v = 1; + unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx)); + unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &v, tx)); + unit_ok(zap_get_stats_by_dnode(dn, &zs)); + + /* We added two entries. */ + unit_eq(zs.zs_num_entries, 2); + + /* MicroZAP is always a single block. */ + unit_eq(zs.zs_num_blocks, 1); + + /* Blocksize matches what we passed to mock_dnode_create(). */ + unit_eq(zs.zs_blocksize, 512); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_microzap(dn)); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +static MunitResult +test_fatzap_stats(const MunitParameter params[], void *data) +{ + (void) params; (void) data; + + dnode_t *dn = mock_zap_create_fatzap(); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + zap_stats_t zs; + uint64_t v = 1; + unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx)); + unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &v, tx)); + unit_ok(zap_get_stats_by_dnode(dn, &zs)); + + /* We added two entries. 
*/ + unit_eq(zs.zs_num_entries, 2); + + /* One header block, one leaf block. */ + unit_eq(zs.zs_num_blocks, 2); + + /* FatZAP block size set by tuneable. */ + unit_eq(zs.zs_blocksize, 1 << fzap_default_block_shift); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_fatzap(dn)); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* ========== */ + +/* Cursor tests. */ + +/* + * Basic cursor test. Add a bunch of keys+values to a ZAP, read them back + * via cursor, confirm they're all there and nothing else is. + */ +static MunitResult +test_cursor(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* For each ASCII letter as key, add a unique value to the ZAP. */ + for (int i = 0; i < 26; i++) { + char c = (char)i + 'a'; + char k[2] = { c, '\0' }; + uint64_t v = (uint64_t)c * 11; + unit_ok(zap_add_by_dnode(dn, k, sizeof (uint64_t), 1, &v, tx)); + } + + /* Sanity check; confirm they're all there by count. */ + uint64_t count = 0; + unit_ok(zap_count_by_dnode(dn, &count)); + unit_eq(count, 26); + + zap_cursor_t zc; + zap_attribute_t *za = zap_attribute_alloc(); + + unit_ok(zap_cursor_init_by_dnode(&zc, dn)); + + /* + * Cursors don't guarantee an order, so we run over them all, + * confirm the key matches the value, and then set a bit for each + * one we've seen. By the end, we should have seen them all. + */ + uint64_t seen = 0; + for (int i = 0; i < 26; i++) { + unit_ok(zap_cursor_retrieve(&zc, za)); + + /* Confirm attribute has the right details for the value. */ + unit_eq(za->za_integer_length, sizeof (uint64_t)); + unit_eq(za->za_num_integers, 1); + + /* + * And the right key in za_name. Note that we don't check + * za_name_len, which is the length of a buffer that can + * definitely hold the key, not the key length itself. 
+ */ + char c = za->za_name[0]; + unit_true(c >= 'a' && c <= 'z'); + unit_zero(za->za_name[1]); + + /* Check the value in the attribute. */ + uint64_t v = (uint64_t)c * 11; + unit_eq(za->za_first_integer, v); + + /* + * Also do a direct lookup and confirm the value matches + * the value from the attribute. + */ + char k[2] = { c, '\0' }; + uint64_t result = 0; + unit_ok(zap_lookup_by_dnode(dn, k, + sizeof (uint64_t), 1, &result)); + unit_eq(result, v); + + /* This one is good, set the bit to remember this fact. */ + seen |= 1 << (c-'a'); + + zap_cursor_advance(&zc); + } + + /* There should be no more keys in the ZAP. */ + unit_err(zap_cursor_retrieve(&zc, za), ENOENT); + + /* Bits 0-25 should be set if we've seen them all. */ + unit_eq(seen, (1 << 26) - 1); + + zap_attribute_free(za); + zap_cursor_fini(&zc); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* + * Cursor serialize test. Add a bunch of items, use the cursor to read half of + * them back, then serialize the cursor. Reload the cursor from the serialized + * state and confirm that we pick up where we left off. Then do it again to + * ensure it doesn't rely on any internal state. + */ +static MunitResult +test_cursor_serialize(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* For each ASCII letter as key, add a unique value to the ZAP. */ + for (int i = 0; i < 26; i++) { + char c = (char)i + 'a'; + char k[2] = { c, '\0' }; + uint64_t v = (uint64_t)c * 11; + unit_ok(zap_add_by_dnode(dn, k, sizeof (uint64_t), 1, &v, tx)); + } + + /* Sanity check; confirm they're all there by count. */ + uint64_t count = 0; + unit_ok(zap_count_by_dnode(dn, &count)); + unit_eq(count, 26); + + /* + * Like test_cursor above, we'll walk over the ZAP and set bits + * for each key we see. 
+ */ + zap_cursor_t zc; + zap_attribute_t *za = zap_attribute_alloc(); + uint64_t seen = 0; + + unit_ok(zap_cursor_init_by_dnode(&zc, dn)); + for (int i = 0; i < 13; i++) { + unit_ok(zap_cursor_retrieve(&zc, za)); + + char c = za->za_name[0]; + unit_true(c >= 'a' && c <= 'z'); + + /* This one is good, set the bit to remember this fact. */ + seen |= 1 << (c-'a'); + + zap_cursor_advance(&zc); + } + + /* Serialise and terminate the cursor. */ + uint64_t cookie = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + + /* + * Record the bits we saw in the first iteration; we'll use this + * when we reload the cursor a second time below. + */ + uint64_t orig_seen = seen; + + /* Reinitialise the cursor from the cookie. */ + unit_ok(zap_cursor_init_serialized_by_dnode(&zc, dn, cookie)); + + /* Loop over the remaining entries and track them. */ + for (int i = 0; i < 13; i++) { + unit_ok(zap_cursor_retrieve(&zc, za)); + + char c = za->za_name[0]; + unit_true(c >= 'a' && c <= 'z'); + + /* This one is good, set the bit to remember this fact. */ + seen |= 1 << (c-'a'); + + zap_cursor_advance(&zc); + } + + /* There should be no more keys in the ZAP. */ + unit_err(zap_cursor_retrieve(&zc, za), ENOENT); + + /* Bits 0-25 should be set if we've seen them all. */ + unit_eq(seen, (1 << 26) - 1); + + /* Cursor done. */ + zap_cursor_fini(&zc); + + /* + * Restore the seen state to before when we reinitialised the saved + * cursor. + */ + seen = orig_seen; + + /* + * Do it all again a second time. This is making sure that the saved + * cursor is usable even after it has been "used".
+ */ + unit_ok(zap_cursor_init_serialized_by_dnode(&zc, dn, cookie)); + for (int i = 0; i < 13; i++) { + unit_ok(zap_cursor_retrieve(&zc, za)); + + char c = za->za_name[0]; + unit_true(c >= 'a' && c <= 'z'); + + seen |= 1 << (c-'a'); + + zap_cursor_advance(&zc); + } + + unit_err(zap_cursor_retrieve(&zc, za), ENOENT); + unit_eq(seen, (1 << 26) - 1); + + zap_attribute_free(za); + zap_cursor_fini(&zc); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* + * The following tests confirm that the cursor is properly cleaning up dnode + * holds taken (or not) across the lifetime of the cursor. The test is not + * about how or when it takes holds, only that the dnode refcount is the + * same before zap_cursor_init() as after zap_cursor_fini(). + */ +static MunitResult +test_cursor_release_unused(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + + uint64_t refcount = mock_dnode_refcount((mock_dnode_t *)dn); + + zap_cursor_t zc; + unit_ok(zap_cursor_init_by_dnode(&zc, dn)); + zap_cursor_fini(&zc); + + unit_eq(refcount, mock_dnode_refcount((mock_dnode_t *)dn)); + + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +static MunitResult +test_cursor_release_advance(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + + uint64_t refcount = mock_dnode_refcount((mock_dnode_t *)dn); + + zap_cursor_t zc; + unit_ok(zap_cursor_init_by_dnode(&zc, dn)); + zap_cursor_advance(&zc); + zap_cursor_fini(&zc); + + unit_eq(refcount, mock_dnode_refcount((mock_dnode_t *)dn)); + + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +static MunitResult +test_cursor_release_empty(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = 
mock_zap_create_params(params, "type"); + + uint64_t refcount = mock_dnode_refcount((mock_dnode_t *)dn); + + zap_cursor_t zc; + zap_attribute_t *za = zap_attribute_alloc(); + + unit_ok(zap_cursor_init_by_dnode(&zc, dn)); + unit_err(zap_cursor_retrieve(&zc, za), ENOENT); + + zap_attribute_free(za); + zap_cursor_fini(&zc); + + unit_eq(refcount, mock_dnode_refcount((mock_dnode_t *)dn)); + + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +static MunitResult +test_cursor_release_one(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + uint64_t v = 1; + unit_ok(zap_add_by_dnode(dn, "a", sizeof (uint64_t), 1, &v, tx)); + unit_ok(zap_add_by_dnode(dn, "b", sizeof (uint64_t), 1, &v, tx)); + + uint64_t refcount = mock_dnode_refcount((mock_dnode_t *)dn); + + zap_cursor_t zc; + zap_attribute_t *za = zap_attribute_alloc(); + + unit_ok(zap_cursor_init_by_dnode(&zc, dn)); + unit_ok(zap_cursor_retrieve(&zc, za)); + + zap_attribute_free(za); + zap_cursor_fini(&zc); + + unit_eq(refcount, mock_dnode_refcount((mock_dnode_t *)dn)); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* ========== */ + +/* zap_value_search: find key with given uint64 value. */ +static MunitResult +test_zap_value_search(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* Add some items. */ + uint64_t v1 = 1, v2 = 2, v3 = 3; + unit_ok(zap_add_by_dnode(dn, "one", sizeof (uint64_t), 1, &v1, tx)); + unit_ok(zap_add_by_dnode(dn, "two", sizeof (uint64_t), 1, &v2, tx)); + unit_ok(zap_add_by_dnode(dn, "three", sizeof (uint64_t), 1, &v3, tx)); + + char name[ZAP_MAXNAMELEN]; + + /* Find one of them. 
*/ + unit_ok(zap_value_search_by_dnode(dn, 2, 0, name, sizeof (name))); + unit_str_eq(name, "two"); + + /* Nonexistent value. */ + unit_err(zap_value_search_by_dnode(dn, 10, 0, + name, sizeof (name)), ENOENT); + + /* Buffer too small for the key. */ + unit_err(zap_value_search_by_dnode(dn, 3, 0, name, 2), ENAMETOOLONG); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* zap_value_search: value masks */ +static MunitResult +test_zap_value_search_mask(const MunitParameter params[], void *data) +{ + (void) data; + + dnode_t *dn = mock_zap_create_params(params, "type"); + dmu_tx_t *tx = (dmu_tx_t *)mock_tx_create(); + + /* + * Add a set of values. These all have the same bottom 16 bits, with + * different upper 48 bits, segmented so we can mask them in different + * and interesting ways. + */ + uint64_t v1 = 0x000000000000f0f0ull; + uint64_t v2 = 0x00000000fffff0f0ull; + uint64_t v3 = 0x0000ffff0000f0f0ull; + uint64_t v4 = 0xffff00000000f0f0ull; + + /* + * Generate four random keys. We do this because zap_value_search() is + * implemented with a simple cursor walk, so will always return the + * first match in hash order, which with fixed keys will always give + * exactly the same results. Using random keys ensures the test values + * are encountered in different orders between test runs, giving us + * better coverage when there are multiple matches. 
+ */ + + char k1[9], k2[9], k3[9], k4[9]; + unit_rand_str(k1, sizeof (k1)); + unit_rand_str(k2, sizeof (k2)); + unit_rand_str(k3, sizeof (k3)); + unit_rand_str(k4, sizeof (k4)); + + unit_ok(zap_add_by_dnode(dn, k1, sizeof (uint64_t), 1, &v1, tx)); + unit_ok(zap_add_by_dnode(dn, k2, sizeof (uint64_t), 1, &v2, tx)); + unit_ok(zap_add_by_dnode(dn, k3, sizeof (uint64_t), 1, &v3, tx)); + unit_ok(zap_add_by_dnode(dn, k4, sizeof (uint64_t), 1, &v4, tx)); + + char name[ZAP_MAXNAMELEN]; + + /* 0 mask is equivalent to all bits set in mask ie exact match. */ + unit_ok(zap_value_search_by_dnode(dn, + 0xf0f0, 0, name, sizeof (name))); + unit_str_eq(name, k1); + unit_ok(zap_value_search_by_dnode(dn, + 0xf0f0, 0xffffffffffffffffull, name, sizeof (name))); + unit_str_eq(name, k1); + + /* Low 16 bits could match any. */ + unit_ok(zap_value_search_by_dnode(dn, + 0xf0f0, 0xffff, name, sizeof (name))); + + /* Low 32 bits, 3/1 matches. */ + unit_ok(zap_value_search_by_dnode(dn, + 0x0000f0f0, 0xffffffff, name, sizeof (name))); + unit_true(strcmp(name, k1) == 0 || strcmp(name, k3) == 0 || + strcmp(name, k4) == 0); + unit_ok(zap_value_search_by_dnode(dn, + 0xfffff0f0, 0xffffffff, name, sizeof (name))); + unit_str_eq(name, k2); + + /* Low 48 bits, 2/1/1 matches */ + unit_ok(zap_value_search_by_dnode(dn, + 0x00000000f0f0ull, 0xffffffffffffull, name, sizeof (name))); + unit_true(strcmp(name, k1) == 0 || strcmp(name, k4) == 0); + unit_ok(zap_value_search_by_dnode(dn, + 0x0000fffff0f0ull, 0xffffffffffffull, name, sizeof (name))); + unit_str_eq(name, k2); + unit_ok(zap_value_search_by_dnode(dn, + 0xffff0000f0f0ull, 0xffffffffffffull, name, sizeof (name))); + unit_str_eq(name, k3); + + /* Value doesn't exist directly, but matches when mask applied. 
*/ + unit_ok(zap_value_search_by_dnode(dn, + 0xffffffff, 0xffff0000, name, sizeof (name))); + unit_str_eq(name, k2); + + mock_tx_destroy((mock_dmu_tx_t *)tx); + unit_true(mock_zap_is_params(dn, params, "type")); + mock_zap_destroy(dn); + + return (MUNIT_OK); +} + +/* ========== */ + +/* Test suite definition and boilerplate. */ + +#define UNIT_PARAM_ZAP_TYPES(p) \ + UNIT_PARAM((p), "micro", "fat") + +static const MunitParameterEnum zap_type_params[] = { + UNIT_PARAM_ZAP_TYPES("type"), + { 0 }, +}; + +#define UNIT_TEST_ZAP_TYPES(name, func) \ + UNIT_TEST(name, func, zap_type_params) + +static const MunitTest zap_tests[] = { + UNIT_TEST("mock_microzap_sanity", test_mock_microzap_sanity), + UNIT_TEST("mock_fatzap_sanity", test_mock_fatzap_sanity), + + UNIT_TEST_ZAP_TYPES("zap_basic", test_zap_basic), + + UNIT_TEST_ZAP_TYPES("zap_add", test_zap_add), + UNIT_TEST_ZAP_TYPES("zap_update", test_zap_update), + UNIT_TEST_ZAP_TYPES("zap_remove", test_zap_remove), + UNIT_TEST_ZAP_TYPES("zap_count", test_zap_count), + UNIT_TEST_ZAP_TYPES("zap_contains", test_zap_contains), + UNIT_TEST_ZAP_TYPES("zap_length", test_zap_length), + + UNIT_TEST_ZAP_TYPES("zap_increment", test_zap_increment), + + UNIT_TEST_ZAP_TYPES("zap_int", test_zap_int), + UNIT_TEST_ZAP_TYPES("zap_int_keys", test_zap_int_keys), + + UNIT_TEST("microzap_stats", test_microzap_stats), + UNIT_TEST("fatzap_stats", test_fatzap_stats), + + UNIT_TEST_ZAP_TYPES("cursor", test_cursor), + UNIT_TEST_ZAP_TYPES("cursor_serialize", test_cursor_serialize), + + UNIT_TEST_ZAP_TYPES( + "cursor_release_unused", test_cursor_release_unused), + UNIT_TEST_ZAP_TYPES( + "cursor_release_advance", test_cursor_release_advance), + UNIT_TEST_ZAP_TYPES( + "cursor_release_empty", test_cursor_release_empty), + UNIT_TEST_ZAP_TYPES( + "cursor_release_one", test_cursor_release_one), + + UNIT_TEST_ZAP_TYPES( + "zap_value_search", test_zap_value_search), + UNIT_TEST_ZAP_TYPES( + "zap_value_search_mask", test_zap_value_search_mask), + + { 0 }, +}; + 
+static const MunitSuite zap_test_suite = { + "zap.", + zap_tests, + NULL, + 1, + MUNIT_SUITE_OPTION_NONE, +}; + +int +main(int argc, char **argv) +{ + mock_crc64_init(); + + zap_init(); + + int rc = munit_suite_main(&zap_test_suite, NULL, argc, argv); + + zap_fini(); + + return (rc); +} diff --git a/sys/contrib/openzfs/tests/unit/unit.c b/sys/contrib/openzfs/tests/unit/unit.c new file mode 100644 index 00000000000..3dd2e7de5d5 --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/unit.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2026, TrueNAS. + */ + +/* Core stubs, applicable to all test suites. */ + +#include +#include + +#include +#include +#include + +#include "munit.h" +#include "unit.h" + +/* + * SET_ERROR() expands to __set_error() in debug builds. It's an + * under-the-hood tracing aid in production; a no-op is fine. + */ +void +__set_error(const char *file, const char *func, int line, int err) +{ + (void) file; (void) func; (void) line; (void) err; +} + +/* Plumb logging and debug into munit for convenience. */ + +/* dprintf() checks zfs_flags and calls __dprintf() in debug builds. */ +int zfs_dbgmsg_enable = 1; +int zfs_flags = ZFS_DEBUG_DPRINTF; + +/* Log dprintf() to MUNIT_LOG_DEBUG. */ +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + char buf[1024]; + + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof (buf), fmt, ap); + va_end(ap); + + munit_logf_ex(MUNIT_LOG_DEBUG, NULL, 0, "%s%s:%d [%s]: %s", + dprint ? 
"dprintf: " : "", file, line, func, buf); +} + +/* Log cmn_err() to MUNIT_LOG_INFO or WARNING, abort test on CE_PANIC. */ +void +cmn_err(int ce, const char *fmt, ...) +{ + if (ce == CE_IGNORE) + return; + + char buf[1024]; + + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof (buf), fmt, ap); + va_end(ap); + + switch (ce) { + case CE_WARN: + munit_logf_ex(MUNIT_LOG_WARNING, NULL, 0, "%s", buf); + break; + case CE_PANIC: + munit_errorf_ex(NULL, 0, "PANIC: %s", buf); + break; + default: + munit_logf_ex(MUNIT_LOG_INFO, NULL, 0, "%s", buf); + break; + } +} + +/* helpers to generate useful random data */ +uint64_t +unit_rand_uint64(void) +{ + uint64_t v = + (((uint64_t)munit_rand_uint32()) << 32) | + ((uint64_t)munit_rand_uint32()); + return (v); +} + +char * +unit_rand_str(char *buf, size_t bufsz) +{ + for (int i = 0; i < bufsz-1; i++) + buf[i] = munit_rand_int_range('a', 'z'); + buf[bufsz-1] = '\0'; + return (buf); +} diff --git a/sys/contrib/openzfs/tests/unit/unit.h b/sys/contrib/openzfs/tests/unit/unit.h new file mode 100644 index 00000000000..a8c23da4118 --- /dev/null +++ b/sys/contrib/openzfs/tests/unit/unit.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2026, TrueNAS. + */ + +#ifndef UNIT_H +#define UNIT_H + +#include "munit.h" + +/* test/suite definition helpers */ + +/* single element in a MunitTest array */ +#define _UNIT_TEST(name, func, params, ...) \ + { (name), (func), NULL, NULL, MUNIT_TEST_OPTION_NONE, \ + (MunitParameterEnum*)(params) } +#define UNIT_TEST(name, func, ...) 
\ + _UNIT_TEST(name, func, ##__VA_ARGS__, NULL) + +/* single element in a MunitParameterEnum array */ +#define UNIT_PARAM(name, ...) \ + { (char *)(name), (char **)(const char *[]) { __VA_ARGS__, NULL } } + +/* shortcut for truthy tests */ +#define unit_true(a) munit_assert_true(a) +#define unit_false(a) munit_assert_false(a) + +/* shortcut for zero test */ +#define unit_zero(a) munit_assert_uint64((a), ==, 0) + +/* shortcuts for integer comparisons */ +#define _unit_op(a, op, b) munit_assert_uint64((a), op, (b)) + +#define unit_eq(a, b) _unit_op((a), ==, (b)) +#define unit_ne(a, b) _unit_op((a), !=, (b)) +#define unit_le(a, b) _unit_op((a), <=, (b)) +#define unit_ge(a, b) _unit_op((a), >=, (b)) +#define unit_lt(a, b) _unit_op((a), <, (b)) +#define unit_gt(a, b) _unit_op((a), >, (b)) + +/* shortcuts for string comparisons */ +#define unit_str_eq(a, b) munit_assert_string_equal(a, b) +#define unit_str_ne(a, b) munit_assert_string_not_equal(a, b) + +/* shortcuts for error-returning function call */ +#define unit_ok(a) munit_assert_int((a), ==, 0) +#define unit_err(a, e) munit_assert_int((a), ==, (e)) + +/* helpers to generate useful random data */ +extern uint64_t unit_rand_uint64(void); +extern char *unit_rand_str(char *buf, size_t bufsz); + +#endif /* UNIT_H */ diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am index 9f92310985e..3275c1358aa 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am @@ -35,7 +35,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/crypto_test %C%_crypto_test_LDADD = libzpool.la scripts_zfs_tests_bin_PROGRAMS += %D%/clone_after_trunc -%C%_clone_after_trunc_LDADD = -lpthread if WANT_DEVNAME2DEVID scripts_zfs_tests_bin_PROGRAMS += %D%/devname2devid @@ -71,7 +70,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/libzfs_mnttab_cache_check libzfs.la scripts_zfs_tests_bin_PROGRAMS += %D%/manipulate_user_buffer 
-%C%_manipulate_user_buffer_LDADD = -lpthread scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree %C%_mkfile_LDADD = $(LTLIBINTL) @@ -80,7 +78,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree scripts_zfs_tests_bin_PROGRAMS += \ %D%/mmap_exec %D%/mmap_ftruncate %D%/mmap_seek \ %D%/mmap_sync %D%/mmapwrite %D%/readmmap %D%/mmap_write_sync -%C%_mmapwrite_LDADD = -lpthread if WANT_MMAP_LIBAIO scripts_zfs_tests_bin_PROGRAMS += %D%/mmap_libaio @@ -95,7 +92,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/nvlist_to_lua libnvpair.la scripts_zfs_tests_bin_PROGRAMS += %D%/rm_lnkcnt_zero_file -%C%_rm_lnkcnt_zero_file_LDADD = -lpthread scripts_zfs_tests_bin_PROGRAMS += %D%/send_doall %C%_send_doall_LDADD = \ @@ -107,7 +103,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/stride_dd %C%_stride_dd_LDADD = -lrt scripts_zfs_tests_bin_PROGRAMS += %D%/threadsappend -%C%_threadsappend_LDADD = -lpthread scripts_zfs_tests_bin_PROGRAMS += %D%/ereports %C%_ereports_LDADD = \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check.c index 4ef249bbd4a..8f7e36d9efa 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check.c @@ -85,7 +85,6 @@ static const zfs_ioc_t ioc_skip[] = { ZFS_IOC_DSOBJ_TO_DSNAME, ZFS_IOC_OBJ_TO_PATH, ZFS_IOC_POOL_SET_PROPS, - ZFS_IOC_POOL_GET_PROPS, ZFS_IOC_SET_FSACL, ZFS_IOC_GET_FSACL, ZFS_IOC_SHARE, @@ -125,11 +124,136 @@ static const zfs_ioc_t ioc_skip[] = { lzc_ioctl_test(ioc, name, req, opt, err, wild); \ } while (0) +#define IOC_INPUT_TEST_INJECT(ioc, name, innvl) \ + do { \ + active_test = __func__ + 5; \ + lzc_ioctl_run_impl(ioc, name, innvl, 0, B_TRUE); \ + } while (0) + +/* + * Given a zfs_cmd_t containing an already packed nvlist in zc->zc_nvlist_src, + * and its original innvl, look in innvl for the last string nvpair, or last + * string array nvpair, 
and remove the string terminator. The idea is to + * corrupt the nvlist string value so that anyone doing a strlen() on it will + * read past the end of the packed nvlist buffer and trigger a crash. + */ +static void +do_bad_string(zfs_cmd_t *zc, nvlist_t *innvl) +{ + nvpair_t *elem = NULL; + nvpair_t *lastseen = NULL; + const char *str = NULL; + const char **arr; + uint_t n; + char *off; + char *packed; + uint64_t size, off_size; + + while ((elem = nvlist_next_nvpair(innvl, elem)) != NULL) { + if ((nvpair_type(elem) == DATA_TYPE_STRING) || + (nvpair_type(elem) == DATA_TYPE_STRING_ARRAY)) + lastseen = elem; + } + + if (lastseen == NULL) + return; /* No strings */ + + /* + * Lookup either the last string, or the last string in the last + * string array in the nvlist. We will use this to corrupt from the + * string to the end of the nvlist buffer. Any attempts to strlen this + * string should run past the end of the packed buffer. + */ + if (nvpair_value_string(lastseen, &str) != 0 && + nvpair_value_string_array(lastseen, &arr, &n) == 0 && n > 0) + str = arr[n-1]; + if (str == NULL || str[0] == '\0') + return; /* No usable string to corrupt */ + + /* + * We now have the last string. Corrupt everything from the last + * character of that string, and so its NULL terminator too, to the + * end of the packed nvlist buffer. + */ + packed = (char *)zc->zc_nvlist_src; + size = zc->zc_nvlist_src_size; + off_size = strlen(str); + off = memmem(packed, size, str, off_size); + if (off == NULL) + return; /* String not found in the packed buffer */ + memset(&off[off_size - 1], '!', (packed + size) - + (&off[off_size - 1])); +} + +/* + * For each byte in the packed nvlist in zc, corrupt a single byte, then + * try doing the ioctl. This tests how well the kernel handles fuzzed nvlists. + * + * NOTE - make sure you are doing this with a "safe" ioctl! You don't want to + * run this on an ioctl that can potentially corrupt data (like a zpool create).
+ */ +static void +do_fuzz(int zfs_fd, zfs_ioc_t ioc, zfs_cmd_t *zc) +{ + uint64_t size; + uint64_t i; + unsigned char old = 0; + unsigned char *pos; + zfs_cmd_t orig_zc = *zc; + + pos = (unsigned char *) zc->zc_nvlist_src; + size = zc->zc_nvlist_src_size; + + /* + * Fuzz each byte in the packed nvlist, one byte at a time, and do the + * ioctl. If the kernel doesn't crash, then the test passed. + */ + for (i = 0; i < size; i++) { + /* Restore the previously corrupted byte */ + if (i > 0) + pos[i-1] = old; + + old = pos[i]; + + /* Corrupt the new byte */ + pos[i]++; + + /* + * Do the ioctl and ignore the return code. We just want to + * see if the kernel panics. + */ + lzc_ioctl_fd(zfs_fd, ioc, zc); + + /* + * Restore 'zc' with original fields since the ioctl may + * have modified them. + */ + *zc = orig_zc; + } + /* Restore last byte */ + if (i > 0) + pos[i - 1] = old; + + /* + * Try fuzzing the packed nvlist size field. Test it with one byte + * bigger and one byte smaller than the current value. + */ + zc->zc_nvlist_src_size--; + lzc_ioctl_fd(zfs_fd, ioc, zc); + + zc->zc_nvlist_src_size += 2; + lzc_ioctl_fd(zfs_fd, ioc, zc); + + /* Restore to normal */ + zc->zc_nvlist_src_size -= 1; +} + /* * run a zfs ioctl command, verify expected results and log failures */ static void -lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected) +lzc_ioctl_run_impl(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, + int expected, boolean_t do_corrupt) { zfs_cmd_t zc = {"\0"}; char *packed = NULL; @@ -160,10 +284,30 @@ lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected) zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); zc.zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(zc.zc_nvlist_dst_size); + if (do_corrupt) { + /* + * Try changing bytes in the packed nvlist to see if it will + * panic the kernel when you do the ioctl. 
+ */ + do_fuzz(zfs_fd, ioc, &zc); + + /* + * Corrupt the last string in the packed nvlist so it has no + * NULL terminator. + */ + do_bad_string(&zc, innvl); + + } + if (lzc_ioctl_fd(zfs_fd, ioc, &zc) != 0) error = errno; - if (error != expected) { + /* + * If we're corrupting the nvlist we don't care about the specific + * error code that gets returned, as it could be one of many. We only + * care if it panics the kernel. + */ + if (!do_corrupt && error != expected) { unexpected_failures = B_TRUE; (void) fprintf(stderr, "%s: Unexpected result with %s, " "error %d (expecting %d)\n", @@ -174,6 +318,12 @@ lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected) free((void *)(uintptr_t)zc.zc_nvlist_dst); } +static void +lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected) +{ + lzc_ioctl_run_impl(ioc, name, innvl, expected, B_FALSE); +} + /* * Test each ioc for the following ioctl input errors: * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel @@ -310,6 +460,7 @@ test_log_history(const char *pool) fnvlist_add_string(required, "message", "input check"); IOC_INPUT_TEST(ZFS_IOC_LOG_HISTORY, pool, required, NULL, 0); + IOC_INPUT_TEST_INJECT(ZFS_IOC_LOG_HISTORY, pool, required); nvlist_free(required); } @@ -791,6 +942,20 @@ test_set_bootenv(const char *pool) nvlist_free(required); } +static void +test_zpool_get(const char *pool) +{ + const char *strs[] = {ZPOOL_DEDUPCACHED_PROP_NAME}; + nvlist_t *optional = fnvlist_alloc(); + + fnvlist_add_string_array(optional, ZPOOL_GET_PROPS_NAMES, strs, 1); + + IOC_INPUT_TEST(ZFS_IOC_POOL_GET_PROPS, pool, NULL, optional, 0); + IOC_INPUT_TEST_INJECT(ZFS_IOC_POOL_GET_PROPS, pool, optional); + + nvlist_free(optional); +} + static void zfs_ioc_input_tests(const char *pool) { @@ -885,6 +1050,7 @@ zfs_ioc_input_tests(const char *pool) test_scrub(pool); + test_zpool_get(pool); /* * cleanup */ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index c4bcfea5595..c7931ca95e2 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -376,6 +376,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/rsend/rsend.kshlib \ functional/scrub_mirror/default.cfg \ functional/scrub_mirror/scrub_mirror_common.kshlib \ + functional/send_xdr_encoding/send_xdr_encoding.cfg \ + functional/send_xdr_encoding/send_xdr_encoding.kshlib \ functional/slog/slog.cfg \ functional/slog/slog.kshlib \ functional/snapshot/snapshot.cfg \ @@ -434,6 +436,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ + functional/alloc_class/alloc_class_014_pos.ksh \ + functional/alloc_class/alloc_class_015_neg.ksh \ functional/alloc_class/alloc_class_016_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ @@ -648,6 +652,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zdb/zdb_encrypted.ksh \ functional/cli_root/zdb/zdb_encrypted_raw.ksh \ functional/cli_root/zdb/zdb_label_checksum.ksh \ + functional/cli_root/zdb/zdb_file_layout_001.ksh \ + functional/cli_root/zdb/zdb_file_layout_002.ksh \ + functional/cli_root/zdb/zdb_file_layout_003.ksh \ + functional/cli_root/zdb/zdb_file_layout_neg.ksh \ functional/cli_root/zdb/zdb_object_range_neg.ksh \ functional/cli_root/zdb/zdb_object_range_pos.ksh \ functional/cli_root/zdb/zdb_objset_id.ksh \ @@ -806,6 +814,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ + functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ 
functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ functional/cli_root/zfs_program/cleanup.ksh \ @@ -1295,6 +1304,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_003_neg.ksh \ functional/cli_root/zpool_set/zpool_set_ashift.ksh \ + functional/cli_root/zpool_set/zpool_set_inherit.ksh \ functional/cli_root/zpool_set/user_property_001_pos.ksh \ functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ @@ -1495,6 +1505,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/deadman/deadman_zio.ksh \ functional/dedup/cleanup.ksh \ functional/dedup/setup.ksh \ + functional/dedup/dedup_bclone.ksh \ + functional/dedup/dedup_bclone_pruned.ksh \ functional/dedup/dedup_fdt_create.ksh \ functional/dedup/dedup_fdt_import.ksh \ functional/dedup/dedup_fdt_pacing.ksh \ @@ -1608,6 +1620,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/auto_spare_001_pos.ksh \ functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ + functional/fault/auto_spare_rotational.ksh \ functional/fault/auto_spare_double.ksh \ functional/fault/auto_spare_multiple.ksh \ functional/fault/auto_spare_shared.ksh \ @@ -2123,6 +2136,22 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/scrub_mirror/scrub_mirror_003_pos.ksh \ functional/scrub_mirror/scrub_mirror_004_pos.ksh \ functional/scrub_mirror/setup.ksh \ + functional/send_xdr_encoding/cleanup.ksh \ + functional/send_xdr_encoding/setup.ksh \ + functional/send_xdr_encoding/xdr_bookmark_raw.ksh \ + functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh \ + functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh \ + functional/send_xdr_encoding/xdr_incr_from_redacted.ksh \ + functional/send_xdr_encoding/xdr_raw.ksh \ + functional/send_xdr_encoding/xdr_redacted_full.ksh \ + functional/send_xdr_encoding/xdr_redacted_received.ksh \ + 
functional/send_xdr_encoding/xdr_redacted_received_raw.ksh \ + functional/send_xdr_encoding/xdr_replication.ksh \ + functional/send_xdr_encoding/xdr_resume.ksh \ + functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh \ + functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh \ + functional/send_xdr_encoding/xdr_resume_raw.ksh \ + functional/send_xdr_encoding/xdr_resume_redacted.ksh \ functional/slog/cleanup.ksh \ functional/slog/setup.ksh \ functional/slog/slog_001_pos.ksh \ @@ -2264,6 +2293,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/vdev_zaps/vdev_zaps_005_pos.ksh \ functional/vdev_zaps/vdev_zaps_006_pos.ksh \ functional/vdev_zaps/vdev_zaps_007_pos.ksh \ + functional/vdev_zaps/vdev_zaps_008_pos.ksh \ functional/write_dirs/cleanup.ksh \ functional/write_dirs/setup.ksh \ functional/write_dirs/write_dirs_001_pos.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib index be281c62404..649a6ec601c 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib @@ -67,3 +67,37 @@ function display_status return $ret } + +# +# Verify the file identified by the input is written on a special vdev +# According to the pool layout used in this test vdev_id 3 and 4 are special +# +function file_in_special_vdev # +{ + typeset dataset="$1" + typeset inum="$2" + typeset num_normal=$(echo $ZPOOL_DISKS | wc -w) + num_normal=${num_normal##* } + + zdb -dddddd $dataset $inum | awk -v d=$num_normal '{ +# find DVAs from string "offset level dva" only for L0 (data) blocks +if (match($0,"L0 [0-9]+")) { + dvas[0]=$3 + dvas[1]=$4 + dvas[2]=$5 + for (i = 0; i < 3; ++i) { + if (match(dvas[i],"([^:]+):.*")) { + dva = substr(dvas[i], RSTART, RLENGTH); + # parse DVA from string "vdev:offset:asize" + if 
(split(dva,arr,":") != 3) { + print "Error parsing DVA: <" dva ">"; + exit 1; + } + # verify vdev is "special" + if (arr[1] < d) { + exit 1; + } + } + } +}}' +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh index 743a717b2e8..3d463b37611 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh @@ -25,41 +25,6 @@ verify_runnable "global" -# -# Verify the file identified by the input is written on a special vdev -# According to the pool layout used in this test vdev_id 3 and 4 are special -# XXX: move this function to libtest.shlib once we get "Vdev Properties" -# -function file_in_special_vdev # -{ - typeset dataset="$1" - typeset inum="$2" - typeset num_normal=$(echo $ZPOOL_DISKS | wc -w) - num_normal=${num_normal##* } - - zdb -dddddd $dataset $inum | awk -v d=$num_normal '{ -# find DVAs from string "offset level dva" only for L0 (data) blocks -if (match($0,"L0 [0-9]+")) { - dvas[0]=$3 - dvas[1]=$4 - dvas[2]=$5 - for (i = 0; i < 3; ++i) { - if (match(dvas[i],"([^:]+):.*")) { - dva = substr(dvas[i], RSTART, RLENGTH); - # parse DVA from string "vdev:offset:asize" - if (split(dva,arr,":") != 3) { - print "Error parsing DVA: <" dva ">"; - exit 1; - } - # verify vdev is "special" - if (arr[1] < d) { - exit 1; - } - } - } -}}' -} - # # Check that device removal works for special class vdevs # diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh new file mode 100755 index 00000000000..27c55bc5906 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_pos.ksh @@ -0,0 +1,109 @@ +#!/bin/ksh -p +# 
SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2026, TrueNAS. +# + +. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib + +# +# DESCRIPTION: +# The alloc_bias vdev property is readable and settable on top-level vdevs. +# +# STRATEGY: +# 1. Create a pool with one normal mirror and one special mirror. +# 2. Verify alloc_bias getter returns "none" for normal and "special" +# for the special mirror. +# 3. Verify alloc_bias is not reported for leaf (child) vdevs. +# 4. Set alloc_bias=none on the special vdev; verify getter returns "none". +# 5. Export and import the pool; verify no "special" section in status. +# 6. Set alloc_bias=dedup on the same vdev; verify getter returns "dedup". +# 7. Export and import the pool; verify "dedup" section appears in status. +# 8. Set alloc_bias=special; verify getter returns "special". +# 9. Export and import; verify "special" section appears again. +# + +verify_runnable "global" + +claim="alloc_bias vdev property is readable and settable on top-level vdevs" + +log_assert $claim +log_onexit cleanup + +log_must disk_setup + +# One normal mirror (always stays normal) and one special mirror. +# The normal mirror ensures the pool always has normal-class vdevs +# regardless of what we do to the second mirror. +log_must zpool create $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ + special mirror $CLASS_DISK0 $CLASS_DISK1 + +# Find the special vdev name (mirror-N) from zpool status. 
+TVDEV=$(zpool status $TESTPOOL | \ + awk '/special/{found=1} found && /mirror-/{print $1; exit}') +log_note "Special vdev: $TVDEV" +[[ -n "$TVDEV" ]] || log_fail "Could not determine special vdev name" + +# Verify initial alloc_bias values. +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL mirror-0) +[[ "$BIAS" == "none" ]] || \ + log_fail "Normal mirror alloc_bias: expected none, got $BIAS" + +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV) +[[ "$BIAS" == "special" ]] || \ + log_fail "Special mirror alloc_bias: expected special, got $BIAS" + +# Verify alloc_bias is not reported for a leaf vdev. +LEAF_OUT=$(zpool get -H -o name,value alloc_bias $TESTPOOL \ + $ZPOOL_DISK0 2>&1) +[[ -z "$LEAF_OUT" ]] || \ + log_fail "alloc_bias reported for leaf vdev, got: $LEAF_OUT" + +# --- special -> none, verify after export/import --- +log_must zpool set alloc_bias=none $TESTPOOL $TVDEV +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV) +[[ "$BIAS" == "none" ]] || \ + log_fail "After set none: alloc_bias expected none, got $BIAS" + +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL +zpool status $TESTPOOL | grep -q "special" && \ + log_fail "special still shown after alloc_bias=none + reimport" + +# --- none -> dedup, verify after export/import --- +log_must zpool set alloc_bias=dedup $TESTPOOL $TVDEV +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV) +[[ "$BIAS" == "dedup" ]] || \ + log_fail "After set dedup alloc_bias expected dedup, got $BIAS" + +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL +zpool status $TESTPOOL | grep -q "dedup" || \ + log_fail "dedup not shown after alloc_bias=dedup + reimport" + +# --- dedup -> special, verify after export/import --- +log_must zpool set alloc_bias=special $TESTPOOL $TVDEV +BIAS=$(zpool get -H -o value alloc_bias $TESTPOOL $TVDEV) +[[ "$BIAS" == "special" ]] || \ + log_fail "After set special alloc_bias expected special, got $BIAS" + 
+log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL +zpool status $TESTPOOL | grep -q "special" || \ + log_fail "special not shown after alloc_bias=special + reimport" + +log_must zpool destroy -f $TESTPOOL +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh new file mode 100755 index 00000000000..43740690b3c --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_neg.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2026, TrueNAS. +# + +. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib + +# +# DESCRIPTION: +# Setting the alloc_bias vdev property to invalid values or on +# unsupported vdev types fails. +# +# STRATEGY: +# 1. Create a pool with a normal mirror and a log vdev. +# 2. Verify setting alloc_bias on a leaf vdev fails. +# 3. Verify setting alloc_bias=log fails. +# 4. Verify setting alloc_bias to an unknown value fails. +# 5. Verify setting alloc_bias on a log vdev fails. +# 6. Verify setting alloc_bias=special fails when allocation_classes +# feature is not enabled. +# 7. Verify converting the last normal vdev fails. +# + +verify_runnable "global" + +claim="Setting alloc_bias to invalid values or on unsupported vdevs fails" + +log_assert $claim +log_onexit cleanup + +log_must disk_setup + +# Create a pool with a normal mirror and a log vdev. 
+log_must zpool create $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ + log $CLASS_DISK0 + +NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}') +log_note "Normal vdev: $NORMAL_VDEV" + +# Setting alloc_bias on a leaf vdev must fail. +log_mustnot zpool set alloc_bias=special $TESTPOOL $ZPOOL_DISK0 + +# Setting alloc_bias=log must fail (log vdevs must be removed and re-added). +log_mustnot zpool set alloc_bias=log $TESTPOOL $NORMAL_VDEV + +# Setting alloc_bias to an unknown value must fail. +log_mustnot zpool set alloc_bias=bogus $TESTPOOL $NORMAL_VDEV + +# Setting alloc_bias on a log vdev must fail. +# CLASS_DISK0 is a single-disk (non-mirror) top-level log vdev. +log_mustnot zpool set alloc_bias=special $TESTPOOL $CLASS_DISK0 + +log_must zpool destroy -f $TESTPOOL + +# Verify setting alloc_bias=special fails when allocation_classes is disabled. +# Create a pool with the allocation_classes feature explicitly disabled. +log_must zpool create -o feature@allocation_classes=disabled $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 + +NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}') +log_mustnot zpool set alloc_bias=special $TESTPOOL $NORMAL_VDEV +log_mustnot zpool set alloc_bias=dedup $TESTPOOL $NORMAL_VDEV + +log_must zpool destroy -f $TESTPOOL + +# Verify that converting the last normal-class top-level vdev fails. +# A pool must always retain at least one normal vdev. 
+log_must zpool create $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ + special mirror $CLASS_DISK0 $CLASS_DISK1 + +NORMAL_VDEV=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}') +log_mustnot zpool set alloc_bias=special $TESTPOOL $NORMAL_VDEV +log_mustnot zpool set alloc_bias=dedup $TESTPOOL $NORMAL_VDEV + +log_must zpool destroy -f $TESTPOOL +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh index 01e9cf49dc8..cda4b0ee953 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh @@ -51,4 +51,7 @@ log_must zfs set recordsize=$RECORDSIZE $TESTDSTFS bclone_corner_cases_test $TESTSRCDIR $TESTDSTDIR +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh index e1b583813f1..0d2c0f6e16c 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh @@ -50,4 +50,7 @@ for filesize in 1 107 113 511 512 513 4095 4096 4097 131071 131072 131073 \ bclone_test random $filesize false $TESTSRCDIR $TESTDSTDIR done +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh index d18a1bd2490..619fc3e4216 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh @@ -45,4 +45,7 @@ log_must zfs set recordsize=$RECORDSIZE $TESTSRCFS bclone_corner_cases_test $TESTSRCDIR $TESTSRCDIR +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh index 45551e04646..f1f80a9c059 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh @@ -46,4 +46,7 @@ for filesize in 1 107 113 511 512 513 4095 4096 4097 131071 131072 131073 \ bclone_test random $filesize false $TESTSRCDIR $TESTSRCDIR done +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_after_device_removal.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_after_device_removal.ksh index b407d4c541d..d4b7f01e8ba 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_after_device_removal.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_after_device_removal.ksh @@ -57,5 +57,9 @@ log_must zfs create $TESTPOOL/$TESTFS log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/$TESTFS/file log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=16M count=2 log_must zfs destroy -r $TESTPOOL/$TESTFS +wait_freeing $TESTPOOL +sync_pool $TESTPOOL + +log_must zdb -b $TESTPOOL log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh index 4c652923545..7c183234922 100755 --- 
a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh @@ -83,5 +83,8 @@ typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file1 $TESTPOOL/$TESTFS file2 # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). log_must [ "$blocks" = "$(seq -s " " 0 1021 | sed 's/ $//')" ] +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + log_pass "LWB buffer overflow is not triggered with multiple VDEVs ZIL" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh index 2e854d7e543..ad24c1f06ba 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh @@ -126,4 +126,7 @@ typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \ # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). log_must [ "$blocks" = "$(seq -s " " 0 2047 | sed 's/ $//')" ] +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh index eb1464ff4d4..6b9ea354226 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh @@ -128,4 +128,7 @@ typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \ # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). 
log_must [ "$blocks" = "$(seq -s " " 0 2047 | sed 's/ $//')" ] +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_001.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_001.ksh new file mode 100755 index 00000000000..f9c9555b84b --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_001.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Datto, Inc. All rights reserved. +# Copyright (c) 2026, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# zdb -fHv will display block +# layouts for the object. +# +# Strategery: +# 1. Create a RAIDZ1 pool, set compression to none +# 2. Create a file filled with random data +# 3. Get the inode number of the file +# 4. Run zdb -fHv / & extract file +# 5. Compare real file and extracted file. + +DATA=/$TESTPOOL1/random.bin +BLOCKS=$(( $RANDOM % 16 )) +COMPARE=/tmp/compare.$$ + +function cleanup +{ + destroy_pool $TESTPOOL1 + rm -f $TESTDIR/file?.bin $COMPARE +} + +log_assert "Verify zdb -fHv displays correct offsets" +log_onexit cleanup + +# 1. 
Create a RAIDZ1 pool +log_must mkdir -p $TESTDIR +for file in 1 2 3 4 5 +do + rm -f $TESTDIR/file${file}.bin + touch $TESTDIR/file${file}.bin + log_must truncate -s 128m $TESTDIR/file${file}.bin +done + +log_must zpool create -O compression=off -O recordsize=16K $TESTPOOL1 raidz1 $TESTDIR/file[12345].bin +zfs get compression,recordsize $TESTPOOL1 +# 2. Create a file with random data +log_must rm -f $DATA +log_must dd if=/dev/urandom of=${DATA} bs=16k count=${BLOCKS} > /dev/null 2>&1 +log_must zpool sync $TESTPOOL1 + +# 3. Get the inode number of the file +INUM=$(ls -li $DATA | cut -f1 -d ' ') + +# 4. Extract the contents of the file using dd +rm -f $COMPARE +log_must touch ${COMPARE} +log_must zdb -fHv $TESTPOOL1/ ${INUM} | grep 'D.$' | + while read file offset count rest + do + log_must sh -c "dd if=$TESTDIR/${file} bs=512 skip=${offset} count=${count} >> ${COMPARE}" + done + +# 5. Compare files +log_must cmp ${COMPARE} ${DATA} + +log_pass "'zdb -fHv' works as expected." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_002.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_002.ksh new file mode 100755 index 00000000000..455ec6ccb21 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_002.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Datto, Inc. All rights reserved. +# Copyright (c) 2026, Klara Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# Description: +# zdb -fHv will display block +# layouts for the object. +# +# Strategery: +# 1. Create a RAIDZ2 pool, set compression to none +# 2. Create a file filled with random data +# 3. Get the inode number of the file +# 4. Run zdb -fHv / & extract file +# 5. Compare real file and extracted file. + +DATA=/$TESTPOOL1/random.bin +BLOCKS=$(( $RANDOM % 16 )) +COMPARE=/tmp/compare.$$ + +function cleanup +{ + destroy_pool $TESTPOOL1 + rm -f $TESTDIR/file?.bin $COMPARE +} + +log_assert "Verify zdb -fHv displays correct offsets" +log_onexit cleanup + +# 1. Create a RAIDZ1 pool +log_must mkdir -p $TESTDIR +for file in 1 2 3 4 5 6 +do + rm -f $TESTDIR/file${file}.bin + touch $TESTDIR/file${file}.bin + log_must truncate -s 128m $TESTDIR/file${file}.bin +done + +log_must zpool create -O compression=off -O recordsize=16K $TESTPOOL1 raidz2 $TESTDIR/file[123456].bin +zfs get compression,recordsize $TESTPOOL1 +# 2. Create a file with random data +log_must rm -f $DATA +log_must dd if=/dev/urandom of=${DATA} bs=16k count=${BLOCKS} > /dev/null 2>&1 +log_must zpool sync $TESTPOOL1 + +# 3. Get the inode number of the file +INUM=$(ls -li $DATA | cut -f1 -d ' ') + +# 4. Extract the contents of the file using dd +rm -f $COMPARE +log_must touch ${COMPARE} +log_must zdb -fHv $TESTPOOL1/ ${INUM} | grep 'D.$' | + while read file offset count rest + do + log_must sh -c "dd if=$TESTDIR/${file} bs=512 skip=${offset} count=${count} >> ${COMPARE}" + done + +# 5. Compare files +log_must cmp ${COMPARE} ${DATA} + +log_pass "'zdb -fHv' works as expected." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_003.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_003.ksh new file mode 100755 index 00000000000..7673b3488c7 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_003.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Datto, Inc. All rights reserved. +# Copyright (c) 2026, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# zdb -fHv will display block +# layouts for the object. +# +# Strategery: +# 1. Create a RAIDZ3 pool, set compression to none +# 2. Create a file filled with random data +# 3. Get the inode number of the file +# 4. Run zdb -fHv / & extract file +# 5. Compare real file and extracted file. + +DATA=/$TESTPOOL1/random.bin +BLOCKS=$(( $RANDOM % 16 )) +COMPARE=/tmp/compare.$$ + +function cleanup +{ + destroy_pool $TESTPOOL1 + rm -f $TESTDIR/file?.bin $COMPARE +} + +log_assert "Verify zdb -fHv displays correct offsets" +log_onexit cleanup + +# 1. Create a RAIDZ1 pool +log_must mkdir -p $TESTDIR +for file in 1 2 3 4 5 6 7 +do + rm -f $TESTDIR/file${file}.bin + touch $TESTDIR/file${file}.bin + log_must truncate -s 128m $TESTDIR/file${file}.bin +done + +log_must zpool create -O compression=off -O recordsize=16K $TESTPOOL1 raidz3 $TESTDIR/file[123456].bin +zfs get compression,recordsize $TESTPOOL1 +# 2. 
Create a file with random data +log_must rm -f $DATA +log_must dd if=/dev/urandom of=${DATA} bs=16k count=${BLOCKS} > /dev/null 2>&1 +log_must zpool sync $TESTPOOL1 + +# 3. Get the inode number of the file +INUM=$(ls -li $DATA | cut -f1 -d ' ') + +# 4. Extract the contents of the file using dd +rm -f $COMPARE +log_must touch ${COMPARE} +log_must zdb -fHv $TESTPOOL1/ ${INUM} | grep 'D.$' | + while read file offset count rest + do + log_must sh -c "dd if=$TESTDIR/${file} bs=512 skip=${offset} count=${count} >> ${COMPARE}" + done + +# 5. Compare files +log_must cmp ${COMPARE} ${DATA} + +log_pass "'zdb -fHv' works as expected." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_neg.ksh new file mode 100755 index 00000000000..124bdb6b6b3 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_file_layout_neg.ksh @@ -0,0 +1,57 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019 by Datto, Inc. All rights reserved. +# Copyright (c) 2026, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# Ensure zdb -f only works on raidz +# +# Strategery: +# 1. Create a pool with one disk +# 2. Create a file +# 3. Get the inode number of the file +# 4. Run zdb -f +# 5. Confirm failure status + +function cleanup +{ + destroy_pool $TESTPOOL1 + rm -f $TESTDIR/file1.bin +} + +log_assert "Verify zdb -f fails on non-raidz pool" +log_onexit cleanup + +# 1. 
Create a RAIDZ1 pool +log_must mkdir -p $TESTDIR +touch $TESTDIR/file1.bin +log_must truncate -s 128m $TESTDIR/file1.bin +log_must zpool create -f $TESTPOOL1 $TESTDIR/file1.bin + +# 2. Create a file +log_must touch /$TESTPOOL1/file.txt + +# 3. Get the inode number of the file +INUM=$(ls -li /$TESTDIR/file1.txt | cut -f1 -d ' ') + +# 4. Run zdb -f +log_mustnot zdb -f $TESTDIR/ $INUM + +log_pass "'zdb -f' fails on non-raidz as expected." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib index 08795a7ea25..5d7ceb97112 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib @@ -27,6 +27,8 @@ # # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2017, loli10K . All rights reserved. +# Copyright (c) 2026, TrueNAS. # . $STF_SUITE/include/libtest.shlib @@ -131,3 +133,129 @@ function verify_mount_display done return 0 } + +# Helper functions to call the system mount(8) with various options +function mount_default # +{ + typeset opts= + if is_freebsd; then + opts="-t zfs" + else + opts="-t zfs" + fi + + mount $opts "$@" + return $? +} + +function mount_ro # +{ + typeset opts= + if is_freebsd; then + opts="-t zfs -r" + else + opts="-t zfs -o ro" + fi + + mount $opts "$@" + return $? +} + +function mount_rw # +{ + typeset opts= + if is_freebsd; then + opts="-t zfs -w" + else + opts="-t zfs -o rw" + fi + + mount $opts "$@" + return $? +} + +function remount_ro # +{ + typeset opts= + if is_freebsd; then + opts="-t zfs -ur" + else + opts="-o remount,ro" + fi + + mount $opts "$@" + return $? +} + +function remount_rw # +{ + typeset opts= + if is_freebsd; then + opts="-t zfs -uw" + else + opts="-o remount,rw" + fi + + mount $opts "$@" + return $? 
+} + +# +# Verify that $mountpoint is mounted readonly +# This is preferred over "log_mustnot touch $fs" because we actually want to +# verify the error returned is EROFS +# +function mount_is_ro # mountpoint +{ + typeset mountpoint="$1" + + file_write -o create -f $mountpoint/file.dat + ret=$? + if [[ $ret != 30 ]]; then + log_fail "Writing to $mountpoint did not return EROFS ($ret)." + fi +} + +function mount_is_rw # mountpoint +{ + typeset mountpoint="$1" + log_must touch $mountpoint/file.dat +} + +# Get the read-only/read-write option for $mountpoint +# Prints either "ro" or "rw", or nothing if $mountpoint is not in the mount +# table, or is not a ZFS mount. +function mount_get_ro_rw # mountpoint +{ + typeset mountpoint="$1" + + if is_freebsd; then + # tank/hello /tank/hello zfs rw,nfsv4acls 0 0 + mount -p | \ + awk -v mountpoint="$mountpoint" ' + $2 != mountpoint || $3 != "zfs" { next } + $4 ~ /(^|,)ro(,|$)/ { print "ro" } + $4 ~ /(^|,)rw(,|$)/ { print "rw" }' + else + # tank/hello /tank/hello zfs rw,relatime,xattr,noacl,casesensitive 0 0 + awk -v mountpoint="$mountpoint" ' + $2 != mountpoint || $3 != "zfs" { next } + $4 ~ /(^|,)ro(,|$)/ { print "ro" } + $4 ~ /(^|,)rw(,|$)/ { print "rw" }' /proc/mounts + fi +} + +# Verify that $mountpoint is mounted with a "read-only" option +function mount_has_ro_option # mountpoint +{ + typeset ropt=$(mount_get_ro_rw "$1") + log_must test $ropt == "ro" +} + +# Verify that $mountpoint is mounted with a "read-write" option +function mount_has_rw_option # mountpoint +{ + typeset ropt=$(mount_get_ro_rw "$1") + log_must test $ropt == "rw" +} + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh index c54128f7b9e..a16d17a1229 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh @@ -23,6 +23,7 @@ # # Copyright 2017, loli10K . All rights reserved. +# Copyright (c) 2026, TrueNAS. # . $STF_SUITE/include/libtest.shlib @@ -54,54 +55,6 @@ function cleanup return 0 } -if is_freebsd; then - typeset RO="-t zfs -ur" - typeset RW="-t zfs -uw" -else - typeset RO="-o remount,ro" - typeset RW="-o remount,rw" -fi - -# -# Verify the $filesystem is mounted readonly -# This is preferred over "log_mustnot touch $fs" because we actually want to -# verify the error returned is EROFS -# -function readonlyfs # filesystem -{ - typeset filesystem="$1" - - file_write -o create -f $filesystem/file.dat - ret=$? - if [[ $ret != 30 ]]; then - log_fail "Writing to $filesystem did not return EROFS ($ret)." - fi -} - -# -# Verify $dataset is mounted with $option -# -function checkmount # dataset option -{ - typeset dataset="$1" - typeset option="$2" - typeset options="" - - if is_freebsd; then - options=$(mount -p | awk -v ds="$dataset" '$1 == ds { print $4 }') - else - options=$(awk -v ds="$dataset" '$1 == ds { print $4 }' /proc/mounts) - fi - if [[ "$options" == '' ]]; then - log_fail "Dataset $dataset is not mounted" - elif [[ ! -z "${options##*$option*}" ]]; then - log_fail "Dataset $dataset is not mounted with expected "\ - "option $option ($options)" - else - log_note "Dataset $dataset is mounted with option $option" - fi -} - log_assert "Verify remount functionality on both filesystem and snapshots" log_onexit cleanup @@ -117,35 +70,35 @@ MNTPSNAP="$TESTDIR/zfs_snap_mount" log_must mkdir -p $MNTPSNAP # 2. 
Verify we can (re)mount the dataset readonly/read-write -log_must touch $MNTPFS/file.dat -checkmount $TESTFS 'rw' -log_must mount $RO $TESTFS $MNTPFS -readonlyfs $MNTPFS -checkmount $TESTFS 'ro' -log_must mount $RW $TESTFS $MNTPFS -log_must touch $MNTPFS/file.dat -checkmount $TESTFS 'rw' +mount_is_rw $MNTPFS +mount_has_rw_option $MNTPFS +log_must remount_ro $TESTFS $MNTPFS +mount_is_ro $MNTPFS +mount_has_ro_option $MNTPFS +log_must remount_rw $TESTFS $MNTPFS +mount_is_rw $MNTPFS +mount_has_rw_option $MNTPFS if is_linux; then # 3. Verify we can (re)mount the snapshot readonly - log_must mount -t zfs $TESTSNAP $MNTPSNAP - readonlyfs $MNTPSNAP - checkmount $TESTSNAP 'ro' - log_must mount $RO $TESTSNAP $MNTPSNAP - readonlyfs $MNTPSNAP - checkmount $TESTSNAP 'ro' + log_must mount_default $TESTSNAP $MNTPSNAP + mount_is_ro $MNTPSNAP + mount_has_ro_option $MNTPSNAP + log_must remount_ro $TESTSNAP $MNTPSNAP + mount_is_ro $MNTPSNAP + mount_has_ro_option $MNTPSNAP log_must umount $MNTPSNAP fi # 4. Verify we can't remount a snapshot read-write # The "mount -o rw" command will succeed but the snapshot is mounted readonly. # The "mount -o remount,rw" command must fail with an explicit error. -log_must mount -t zfs -o rw $TESTSNAP $MNTPSNAP -readonlyfs $MNTPSNAP -checkmount $TESTSNAP 'ro' -log_mustnot mount $RW $TESTSNAP $MNTPSNAP -readonlyfs $MNTPSNAP -checkmount $TESTSNAP 'ro' +log_must mount_rw $TESTSNAP $MNTPSNAP +mount_is_ro $MNTPSNAP +mount_has_ro_option $MNTPSNAP +log_mustnot remount_rw $TESTSNAP $MNTPSNAP +mount_is_ro $MNTPSNAP +mount_has_ro_option $MNTPSNAP log_must umount $MNTPSNAP # 5. 
Verify we can remount a dataset readonly and unmount it with @@ -153,8 +106,8 @@ log_must umount $MNTPSNAP log_must eval "echo 'password' | zfs create -o sync=disabled \ -o encryption=on -o keyformat=passphrase $TESTFS/crypt" CRYPT_MNTPFS="$(get_prop mountpoint $TESTFS/crypt)" -log_must touch $CRYPT_MNTPFS/file.dat -log_must mount $RO $TESTFS/crypt $CRYPT_MNTPFS +mount_is_rw $CRYPT_MNTPFS +log_must remount_ro $TESTFS/crypt $CRYPT_MNTPFS log_must umount -f $CRYPT_MNTPFS sync_pool $TESTPOOL @@ -163,10 +116,10 @@ log_must zpool export $TESTPOOL log_must zpool import -o readonly=on $TESTPOOL # 7. Verify we can't remount its filesystem read-write -readonlyfs $MNTPFS -checkmount $TESTFS 'ro' -log_mustnot mount $RW $MNTPFS -readonlyfs $MNTPFS -checkmount $TESTFS 'ro' +mount_is_ro $MNTPFS +mount_has_ro_option $MNTPFS +log_mustnot remount_rw $MNTPFS +mount_is_ro $MNTPFS +mount_has_ro_option $MNTPFS log_pass "Both filesystem and snapshots can be remounted correctly." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh new file mode 100755 index 00000000000..15e78e6fd88 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_ro_rw.ksh @@ -0,0 +1,130 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, TrueNAS. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib + +# +# we set up and mount multiple times, with these combinations: +# - readonly property: on, off +# - mount method: mount(8) (mountpoint=legacy), zfs-mount(8) (mountpoint=path) +# - mount option: [none], ro, rw +# +# after each mount, we check whether we ended up mounting read-only or +# read-write, and note the result. once we've done them all, we compare the +# result set to the "correct" set for this platform (by observation). the +# test passes if they match, fail if they don't +# +# readonly | on | off | +# mount method | legacy | path | legacy | path | +# mount option | -- ro rw | -- ro rw | -- ro rw | -- ro rw | +typeset -a rs_linux=( rw ro rw ro ro rw rw ro rw rw ro rw ) +typeset -a rs_freebsd=( ro ro ro ro ro rw rw ro rw rw ro rw ) + +if is_linux ; then + typeset -n rs_wanted=rs_linux +elif is_freebsd ; then + typeset -n rs_wanted=rs_freebsd +else + log_unsupported "no result set defined for this platform" +fi + +verify_runnable "both" + +testfs=$TESTPOOL/$TESTFS +testmnt=$TESTDIR/mountpoint + +function cleanup +{ + log_must zfs inherit -S canmount $testfs + log_must zfs inherit readonly $testfs + log_must zfs inherit mountpoint $testfs + log_must rm -rf $testmnt +} + +log_assert "Verify combinations of readonly/readwrite produce correct mount." 
+ +log_onexit cleanup + + +# setup +log_must datasetexists $testfs +log_must zfs set canmount=noauto $testfs +umount $testfs + + +typeset -a rs=() + +for readonly in on off ; do + for method in legacy path ; do + for option in default ro rw ; do + + log_must zfs set readonly=$readonly $testfs + + if [[ $method == 'legacy' ]] ; then + log_must zfs set mountpoint=legacy $testfs + else + log_must zfs set mountpoint=$testmnt $testfs + fi + + # recreate the mountpoint. even if it wasn't mounted, + # changing the mountpoint property can remove it + log_must mkdir -p $testmnt + + # issue the mount with the wanted method and option + case $method in + legacy) + case $option in + default) log_must mount_default $testfs $testmnt ;; + ro) log_must mount_ro $testfs $testmnt ;; + rw) log_must mount_rw $testfs $testmnt ;; + esac + ;; + path) + case $option in + default) log_must zfs mount $testfs ;; + ro) log_must zfs mount -o ro $testfs ;; + rw) log_must zfs mount -o rw $testfs ;; + esac + ;; + esac + + result=$(mount_get_ro_rw $testmnt) + rs+=($result) + log_note "result: $result" + + log_must umount $testfs + done + done +done + +log_note "results: ${rs[@]}" +log_note "wanted: ${rs_wanted[@]}" + +log_must test "${rs[*]}" == "${rs_wanted[*]}" + +log_pass "All mounts correct for this platform." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh index 0d2a39be6b5..c8a69c09aac 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh @@ -31,7 +31,7 @@ verify_runnable "global" function cleanup { - zpool destroy $TESTPOOL + destroy_pool $TESTPOOL rm $tmp } @@ -58,7 +58,7 @@ log_must eval "zdb -m --allocated-map $TESTPOOL > $tmp" log_must zpool destroy $TESTPOOL log_must zpool create $TESTPOOL $DISKS -log_must zpool export $TESTPOOL +log_must_busy zpool export $TESTPOOL log_must eval "zhack metaslab leak $TESTPOOL < $tmp" log_must zpool import $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh index 93c320da6fd..f08e4fb6472 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh @@ -23,11 +23,15 @@ # # Copyright (c) 2024, Klara Inc. +# Copyright (c) 2026, Christos Longros # # -# TODO: this only checks that the set of valid device fault types. It should -# check all the other options, and that they work, and everything really. +# This covers device, label, object, delay, panic injection modes: +# every valid value is accepted and unknown values are rejected. +# A final pass also confirms that a registered injection actually +# executes by watching the inject counter advance after triggering +# the desired injected error. # . $STF_SUITE/include/libtest.shlib @@ -39,6 +43,7 @@ log_assert "Check zinject parameters." 
log_onexit cleanup DISK1=${DISKS%% *} +TESTFILE=/$TESTPOOL/zinject_args.file function cleanup { @@ -56,8 +61,139 @@ function test_device_fault zinject -c all } +function test_device_fault_neg +{ + log_mustnot eval "zinject -d $DISK1 -e bogus -T read $TESTPOOL" + log_mustnot eval "zinject -d $DISK1 -e io -T bogus $TESTPOOL" + zinject -c all +} + +function test_label_fault +{ + typeset -a labels=("nvlist" "pad1" "pad2" "uber") + for l in ${labels[@]}; do + log_must eval \ + "zinject -d $DISK1 -e io -L $l $TESTPOOL" + done + zinject -c all +} + +function test_label_fault_neg +{ + log_mustnot eval "zinject -d $DISK1 -e io -L bogus $TESTPOOL" + zinject -c all +} + +function test_object_fault +{ + log_must dd if=/dev/urandom of=$TESTFILE bs=128k count=1 + log_must zpool sync $TESTPOOL + + for t in data dnode; do + log_must eval "zinject -t $t -e io -f 0.001 $TESTFILE" + done + zinject -c all + + for t in mos mosdir metaslab config bpobj spacemap errlog; do + log_must eval "zinject -t $t -e io -f 0.001 $TESTPOOL" + done + zinject -c all +} + +function test_object_fault_neg +{ + log_mustnot eval "zinject -t bogus -e io $TESTPOOL" + log_mustnot eval "zinject -t data -e bogus $TESTFILE" + # -t data only accepts checksum or io as the error type. + log_mustnot eval "zinject -t data -e nxio $TESTFILE" + zinject -c all +} + +function test_delay_fault +{ + log_must eval "zinject -d $DISK1 -D 10:1 $TESTPOOL" + log_must eval "zinject -d $DISK1 -D 25:2 -T read $TESTPOOL" + log_must eval "zinject -d $DISK1 -D 25:2 -T write $TESTPOOL" + zinject -c all +} + +function test_delay_fault_neg +{ + log_mustnot eval "zinject -d $DISK1 -D 0:1 $TESTPOOL" + log_mustnot eval "zinject -d $DISK1 -D 10 $TESTPOOL" + log_mustnot eval "zinject -d $DISK1 -D foo $TESTPOOL" + zinject -c all +} + +function test_panic_fault +{ + # An unmatched function tag so zio_handle_panic_injection() never fires. 
+ log_must eval "zinject -p zfs_test_no_such_fn $TESTPOOL" + log_must eval "zinject -p zfs_test_no_such_fn $TESTPOOL 1" + zinject | grep -q zfs_test_no_such_fn || \ + log_fail "panic function was not registered" + zinject -c all +} + +function test_panic_fault_neg +{ + log_mustnot eval "zinject -p f -d $DISK1 $TESTPOOL" + log_mustnot eval "zinject -p f -t data $TESTFILE" + log_mustnot eval "zinject -p f -f 50 $TESTPOOL" + zinject -c all +} + +# Each registered device/delay/data handler row ends with "match inject". +function inject_count +{ + zinject | awk '/^ *[0-9]/{print $NF}' | head -n 1 +} + +function verify_injection +{ + typeset cnt + + log_must zfs set primarycache=none $TESTPOOL + log_must dd if=/dev/urandom of=$TESTFILE bs=128k count=1 + log_must zpool sync $TESTPOOL + + log_must eval "zinject -d $DISK1 -e io -T read -f 100 $TESTPOOL" + dd if=$TESTFILE of=/dev/null bs=128k count=1 >/dev/null 2>&1 || true + cnt=$(inject_count) + [[ -n $cnt && $cnt -gt 0 ]] || \ + log_fail "device-fault injection did not execute (inject=$cnt)" + zinject -c all + + log_must eval "zinject -t data -e checksum -f 100 $TESTFILE" + dd if=$TESTFILE of=/dev/null bs=128k count=1 >/dev/null 2>&1 || true + cnt=$(inject_count) + [[ -n $cnt && $cnt -gt 0 ]] || \ + log_fail "object-fault injection did not execute (inject=$cnt)" + zinject -c all + + log_must eval "zinject -d $DISK1 -D 5:1 -T write $TESTPOOL" + log_must dd if=/dev/urandom of=$TESTFILE bs=128k count=1 + log_must zpool sync $TESTPOOL + cnt=$(inject_count) + [[ -n $cnt && $cnt -gt 0 ]] || \ + log_fail "delay injection did not execute (inject=$cnt)" + zinject -c all + + log_must zfs inherit primarycache $TESTPOOL +} + default_mirror_setup_noexit $DISKS test_device_fault +test_device_fault_neg +test_label_fault +test_label_fault_neg +test_object_fault +test_object_fault_neg +test_delay_fault +test_delay_fault_neg +test_panic_fault +test_panic_fault_neg +verify_injection log_pass "zinject parameters work as expected." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh index 530661a686a..92c97aacd84 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh @@ -82,7 +82,7 @@ log_must zpool reopen $TESTPOOL1 typeset expandsize=$(get_pool_prop expandsize $TESTPOOL1) log_note "pool expandsize: $expandsize" -if [[ "$zpool_expandsize" = "-" ]]; then +if [[ "$expandsize" = "-" ]]; then log_fail "pool $TESTPOOL1 did not detect any " \ "expandsize after reopen" fi diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index 79992227169..be17821ba1a 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -66,6 +66,7 @@ typeset -a properties=( trim_bytes removing allocating + rotational failfast checksum_n checksum_t diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh index 60088e6dd97..be3344326e9 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh @@ -114,7 +114,7 @@ wait parallel_time=$SECONDS log_note "asyncronously imported 4 pools in $parallel_time seconds" -log_must test $parallel_time -lt $(($sequential_time / 2)) +log_must test $parallel_time -lt 
$(($sequential_time * 3 / 4)) # # export pools with import delay injectors @@ -133,6 +133,6 @@ log_must zpool import -a -d $DEVICE_DIR -f parallel_time=$SECONDS log_note "asyncronously imported 4 pools in $parallel_time seconds" -log_must test $parallel_time -lt $(($sequential_time / 2)) +log_must test $parallel_time -lt $(($sequential_time * 3 / 4)) log_pass "Pool imports occur in parallel" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_inherit.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_inherit.ksh new file mode 100755 index 00000000000..2694e3278d9 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_inherit.ksh @@ -0,0 +1,115 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2026, Klara, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# +# zpool set can set the failfast property to 'inherit' +# +# STRATEGY: +# 1. Create a pool +# 2. Verify that we can set 'failfast' to various values, including inherit +# 3. 
Verify that the root vdev cannot be set to inherit +# + +verify_runnable "global" + +function cleanup +{ + destroy_pool $TESTPOOL1 + rm -f $FILEVDEV1 $FILEVDEV2 $FILEVDEV3 +} + +function get_failfast +{ + zpool get -H -o value failfast $TESTPOOL1 $@ +} + +log_onexit cleanup + +log_assert "zpool set can configure 'failfast' property to inherit" +FILEVDEV1="$TEST_BASE_DIR/zpool_set_inherit1.$$.dat" +FILEVDEV2="$TEST_BASE_DIR/zpool_set_inherit2.$$.dat" +FILEVDEV3="$TEST_BASE_DIR/zpool_set_inherit3.$$.dat" + +log_must truncate -s $MINVDEVSIZE $FILEVDEV1 +log_must truncate -s $MINVDEVSIZE $FILEVDEV2 +log_must truncate -s $MINVDEVSIZE $FILEVDEV3 + +log_must zpool create -f $TESTPOOL1 $FILEVDEV1 mirror $FILEVDEV2 $FILEVDEV3 +failfast=$(get_failfast $FILEVDEV1) +[[ "$failfast" == "inherit" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=on $TESTPOOL1 $FILEVDEV1 +failfast=$(get_failfast $FILEVDEV1) +[[ "$failfast" == "on" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=off $TESTPOOL1 $FILEVDEV1 +failfast=$(get_failfast $FILEVDEV1) +[[ "$failfast" == "off" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=inherit $TESTPOOL1 $FILEVDEV1 + +failfast=$(get_failfast $FILEVDEV2) +[[ "$failfast" == "inherit" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=on $TESTPOOL1 $FILEVDEV2 +failfast=$(get_failfast $FILEVDEV2) +[[ "$failfast" == "on" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=off $TESTPOOL1 $FILEVDEV2 +failfast=$(get_failfast $FILEVDEV2) +[[ "$failfast" == "off" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=inherit $TESTPOOL1 $FILEVDEV2 + +failfast=$(get_failfast mirror-1) +[[ "$failfast" == "inherit" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=on $TESTPOOL1 mirror-1 +failfast=$(get_failfast mirror-1) +[[ "$failfast" == 
"on" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=off $TESTPOOL1 mirror-1 +failfast=$(get_failfast mirror-1) +[[ "$failfast" == "off" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=inherit $TESTPOOL1 mirror-1 + +failfast=$(get_failfast root) +[[ "$failfast" == "on" ]] || log_fail "incorrect failfast value: $failfast" + +log_must zpool set failfast=off $TESTPOOL1 root +failfast=$(get_failfast root) +[[ "$failfast" == "off" ]] || log_fail "incorrect failfast value: $failfast" + +log_mustnot zpool set failfast=inherit $TESTPOOL1 root + + +log_pass "zpool set can configure 'failfast' property to inherit" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh index e37f8e44c1a..a36649bc263 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh @@ -73,7 +73,7 @@ log_must mkdir "$TESTDIR" log_must truncate -s $LARGESIZE "$LARGEFILE" log_must zpool create -O compression=off $TESTPOOL "$LARGEFILE" log_must mkfile $(( floor(LARGESIZE * 0.80) )) /$TESTPOOL/file -sync_all_pools +sync_pool $TESTPOOL new_size=$(du -k "$LARGEFILE" | awk '{print $1 * 1024}') log_must test $new_size -le $LARGESIZE @@ -93,12 +93,8 @@ log_must test $new_size -gt $((4 * floor(LARGESIZE * 0.70) )) # Perform a partial trim, we expect it to skip most of the new metaslabs # which have never been used and therefore do not need be trimmed. 
log_must set_tunable64 TRIM_METASLAB_SKIP 1 -log_must zpool trim $TESTPOOL -log_must set_tunable64 TRIM_METASLAB_SKIP 0 - -while [[ "$(trim_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do - sleep 0.5 -done +log_must zpool trim -w $TESTPOOL +sync_pool $TESTPOOL true new_size=$(du -k "$LARGEFILE" | awk '{print $1 * 1024}') log_must test $new_size -gt $LARGESIZE @@ -106,11 +102,9 @@ log_must test $new_size -gt $LARGESIZE # Perform a full trim, all metaslabs will be trimmed the pool vdev # size will be reduced but not down to its original size due to the # space usage of the new metaslabs. -log_must zpool trim $TESTPOOL - -while [[ "$(trim_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do - sleep 0.5 -done +log_must set_tunable64 TRIM_METASLAB_SKIP 0 +log_must zpool trim -w $TESTPOOL +sync_pool $TESTPOOL true new_size=$(du -k "$LARGEFILE" | awk '{print $1 * 1024}') log_must test $new_size -le $(( 2 * LARGESIZE)) diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh index a2b3464b2bf..b1c12f1306a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh @@ -38,9 +38,8 @@ # # STRATEGY: # 1. Set the interval to 1 and count to 4. -# 2. Sleep for 5 seconds. -# 3. Verify that the output has 4 records. -# 4. Set interval to 0.5 and count to 1 to test floating point intervals. +# 2. Verify that the output has 4 records. +# 3. Set interval to 0.5 and count to 1 to test floating point intervals. verify_runnable "both" @@ -61,8 +60,7 @@ if ! 
is_global_zone ; then TESTPOOL=${TESTPOOL%%/*} fi -log_must eval "zpool iostat $TESTPOOL 1 4 > $tmpfile 2>&1 &" -log_must sleep 5 +log_must eval "zpool iostat $TESTPOOL 1 4 > $tmpfile 2>&1" stat_count=$(grep -c $TESTPOOL $tmpfile) if [[ $stat_count -ne 4 ]]; then diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_bclone.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_bclone.ksh new file mode 100755 index 00000000000..57f54d93ad4 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_bclone.ksh @@ -0,0 +1,120 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, TrueNAS. +# + +# +# DESCRIPTION: +# Verify that block cloning interacts correctly with dedup when the DDT +# entry for the block is still present. In this case brt_pending_apply_vdev() +# calls ddt_addref() which succeeds, so the extra reference is tracked in +# the DDT rather than in the BRT. +# +# STRATEGY: +# 1. Create a pool with block_cloning enabled and dedup=on +# 2. Write a file (4 blocks, unique DDT entries, refcnt=1) +# 3. 
Clone the file - ddt_addref() bumps DDT refcnt to 2, entries move +# from unique to duplicate table; no BRT entries are created +# 4. Write a third copy via dd - DDT refcnt becomes 3 +# 5. Delete files in sequence, verifying DDT counts and zdb -b at each step +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_assert "Block cloning with live DDT entries uses ddt_addref, not BRT" + +# Flush DDT log every TXG so entries appear in the ZAP immediately. +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + log_must restore_tunable DEDUP_LOG_TXG_MAX +} + +log_onexit cleanup + +# we disable compression so our writes create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@block_cloning=enabled \ + -O dedup=on \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS +typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +# Write unique data: 4 blocks, each gets a DDT entry with refcnt=1. +log_must dd if=/dev/urandom of=$mountpoint/file1 bs=128k count=4 +sync_pool $TESTPOOL + +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" + +# Clone file1. The extra reference goes into the DDT rather than the BRT. +# The entries move from unique (refcnt=1) to duplicate (refcnt=2). +log_must clonefile -f $mountpoint/file1 $mountpoint/clone1 +sync_pool $TESTPOOL + +log_must eval \ + "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'" +log_must zdb -b $TESTPOOL + +# Write a third copy via dd — DDT refcnt becomes 3. 
+log_must dd if=$mountpoint/file1 of=$mountpoint/file2 bs=128k +sync_pool $TESTPOOL + +log_must eval \ + "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'" +log_must zdb -b $TESTPOOL + +# Delete the clone — DDT refcnt drops to 2, still duplicate. +log_must rm $mountpoint/clone1 +sync_pool $TESTPOOL + +log_must eval \ + "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate:.*entries=4'" +log_must zdb -b $TESTPOOL + +# Delete file2 — DDT refcnt drops to 1, entries move back to unique. +log_must rm $mountpoint/file2 +sync_pool $TESTPOOL + +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" +log_must zdb -b $TESTPOOL + +# Delete the original — DDT empty, blocks freed. +log_must rm $mountpoint/file1 +sync_pool $TESTPOOL + +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" +log_must zdb -b $TESTPOOL + +log_pass "Block cloning with live DDT entries uses ddt_addref, not BRT" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh new file mode 100755 index 00000000000..d01d09ac12e --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_bclone_pruned.ksh @@ -0,0 +1,152 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, TrueNAS. +# + +# +# DESCRIPTION: +# Verify that block cloning works correctly when the DDT entry for a +# dedup block has been pruned. When a block has the DEDUP bit set but +# no DDT entry (because it was pruned), cloning it must create a BRT +# entry to track the extra reference. Freeing the original must +# consult the BRT rather than proceeding directly to a DVA free, +# otherwise the block is freed while the clone still references it. +# +# STRATEGY: +# 1. Create a pool with both dedup and block_cloning enabled +# 2. Write a file with dedup=on so blocks get DEDUP bit set in their BPs +# 3. Prune the DDT to remove those entries (blocks remain, DEDUP bit +# stays set in block pointers) +# 4. Clone the file - brt_pending_apply_vdev() must fall back to BRT +# since ddt_addref() returns B_FALSE for pruned entries +# 5. Write a second copy via dd - same hash, new physical blocks, new +# DDT entries at different DVAs from the BRT-tracked blocks +# 6. Delete the clone first - must go through BRT, not DDT, even though +# a matching DDT entry now exists for the same hash +# 7. Delete the dd copy - DDT entries freed normally +# 8. Delete the original - no DDT entry, no BRT entry, DVA freed +# 9. Verify reference counts with zdb -b at each step +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_assert "Block cloning of dedup blocks with pruned DDT entries uses BRT" + +# Flush DDT log every TXG so entries appear in the ZAP immediately, +# making ddtprune effective and test behavior predictable. 
+log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000 + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + log_must restore_tunable DEDUP_LOG_TXG_MAX + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +} + +log_onexit cleanup + +log_must zpool create -f -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must zfs create -o dedup=sha256 -o recordsize=128k $TESTPOOL/$TESTFS +typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +# Write unique data: each block gets a DDT entry with refcnt=1. +log_must dd if=/dev/urandom of=$mountpoint/file1 bs=128k count=8 + +sync_pool $TESTPOOL + +# Verify DDT has entries before pruning. +typeset entries=$(zpool status -D $TESTPOOL | \ + grep "dedup: DDT entries" | awk '{print $4}') +log_must test "$entries" -eq 8 + +# Sleep 1s so the DDT entries are at least 1 second old. ddtprune uses +# an age-based cutoff and will silently skip entries that are too fresh. +sleep 1 + +# Prune all unique (refcnt=1) entries. The blocks remain on disk and the +# block pointers in file1 still have the DEDUP bit set, but there is no +# longer a DDT entry for them. +log_must zpool ddtprune -p 100 $TESTPOOL +sync_pool $TESTPOOL + +# Confirm the prune actually removed all entries. +entries=$(zpool status -D $TESTPOOL | \ + grep "dedup: DDT entries" | awk '{print $4}') +[[ -z "$entries" || "$entries" -eq 0 ]] || \ + log_fail "DDT entries not pruned: $entries remain" + +# Clone file1. brt_pending_apply_vdev() will see the DEDUP bit, call +# ddt_addref(), receive B_FALSE (no DDT entry), and fall through to +# create BRT entries instead. +log_must clonefile -f $mountpoint/file1 $mountpoint/clone1 +sync_pool $TESTPOOL + +# BRT entries exist; reference counts must be consistent. +log_must zdb -b $TESTPOOL + +# Write a second copy via dd. 
Since the DDT was pruned, dedup can't find +# an existing entry and writes new physical blocks at new DVAs, creating +# fresh DDT entries with refcnt=1. The BRT-tracked blocks (file1/clone1) +# are at the old DVAs and are unaffected. +log_must dd if=$mountpoint/file1 of=$mountpoint/file2 bs=128k +sync_pool $TESTPOOL + +# Eight new unique DDT entries (file2's blocks); BRT still holds refs for +# file1/clone1's old blocks. +typeset entries=$(zpool status -D $TESTPOOL | \ + grep "dedup: DDT entries" | awk '{print $4}') +log_must test "$entries" -eq 8 +log_must zdb -b $TESTPOOL + +# Delete the clone first. Its blocks carry the DEDUP bit and the same +# hash as file2's DDT entries, but the DVAs differ — the free must go +# through BRT, not DDT, leaving file2's DDT entries intact. +log_must rm $mountpoint/clone1 +sync_pool $TESTPOOL + +entries=$(zpool status -D $TESTPOOL | \ + grep "dedup: DDT entries" | awk '{print $4}') +log_must test "$entries" -eq 8 +log_must zdb -b $TESTPOOL + +# Delete file2. DDT entries freed; file1's BRT-tracked blocks unaffected. +log_must rm $mountpoint/file2 +sync_pool $TESTPOOL +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" +log_must zdb -b $TESTPOOL + +# Delete the original. No DDT entry, no BRT entry; DVA freed directly. +log_must rm $mountpoint/file1 +sync_pool $TESTPOOL +log_must zdb -b $TESTPOOL + +log_pass "Block cloning of dedup blocks with pruned DDT entries uses BRT" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh index 1a82e5d30a1..11e2461d936 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -44,14 +44,12 @@ function cleanup log_onexit cleanup -# create a pool with fast dedup enabled. 
we disable block cloning to ensure -# it doesn't get in the way of dedup, and we disable compression so our writes +# create a pool with fast dedup enabled. we disable compression so our writes # create predictable results on disk # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting log_must zpool create -f \ -o feature@fast_dedup=enabled \ -O dedup=on \ - -o feature@block_cloning=disabled \ -O compression=off \ -O xattr=sa \ $TESTPOOL $DISKS @@ -81,7 +79,7 @@ obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 # copy the file -log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128k log_must zpool sync # now four entries in the duplicate table @@ -104,4 +102,6 @@ log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" # logical table now destroyed; containing object destroyed log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0 +log_must zdb -b $TESTPOOL + log_pass "basic dedup (FDT) operations work" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh index 5f6eb7c3400..1885daf4489 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -44,14 +44,12 @@ function cleanup log_onexit cleanup -# create a pool with fast dedup enabled. we disable block cloning to ensure -# it doesn't get in the way of dedup, and we disable compression so our writes +# create a pool with fast dedup enabled. 
we disable compression so our writes # create predictable results on disk # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting log_must zpool create -f \ -o feature@fast_dedup=enabled \ -O dedup=on \ - -o feature@block_cloning=disabled \ -O compression=off \ -O xattr=sa \ $TESTPOOL $DISKS @@ -117,4 +115,6 @@ obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') # with only one ZAP inside log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 +log_must zdb -b $TESTPOOL + log_pass "dedup (FDT) retains version after import" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh index 8028e4f0884..2bebed6965f 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh @@ -46,11 +46,9 @@ function cleanup log_onexit cleanup -# Create a pool with fast dedup enabled. We disable block cloning to ensure -# it doesn't get in the way of dedup. +# Create a pool with fast dedup enabled. log_must zpool create -f \ -o feature@fast_dedup=enabled \ - -o feature@block_cloning=disabled \ $TESTPOOL $DISKS # Create a filesystem with a small recordsize so that we get more DDT entries, @@ -107,4 +105,6 @@ log_entries3=$(get_ddt_log_entries) # Verify there are 256 entries in the unique table. 
log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=256'" +log_must zdb -b $TESTPOOL + log_pass "dedup (FDT) paces out log entries appropriately" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh index 3348614cb74..cc9a8694724 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh @@ -37,14 +37,12 @@ function cleanup log_onexit cleanup -# create a pool with legacy dedup enabled. we disable block cloning to ensure -# it doesn't get in the way of dedup, and we disable compression so our writes +# create a pool with legacy dedup enabled. we disable compression so our writes # create predictable results on disk # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting log_must zpool create -f \ -o feature@fast_dedup=disabled \ -O dedup=on \ - -o feature@block_cloning=disabled \ -O compression=off \ -O xattr=sa \ $TESTPOOL $DISKS @@ -70,7 +68,7 @@ log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 # copy the file -log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128k log_must zpool sync # now four entries in the duplicate table @@ -93,4 +91,6 @@ log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" # logical table now destroyed; all DDT ZAPs removed log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 +log_must zdb -b $TESTPOOL + log_pass "basic dedup (legacy) operations work" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh index c962efaa7c5..03acaf09b39 
100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -45,13 +45,11 @@ function cleanup log_onexit cleanup -# create a pool with legacy dedup enabled. we disable block cloning to ensure -# it doesn't get in the way of dedup, and we disable compression so our writes +# create a pool with legacy dedup enabled. we disable compression so our writes # create predictable results on disk # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting log_must zpool create -f \ -o feature@fast_dedup=disabled \ - -o feature@block_cloning=disabled \ -O compression=off \ -O xattr=sa \ $TESTPOOL $DISKS @@ -102,4 +100,6 @@ log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | wc -l) -eq 1 obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | awk '{ print $NF }') log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-.*-zap- | wc -l) -eq 1 +log_must zdb -b $TESTPOOL + log_pass "legacy and FDT dedup tables on the same pool can happily coexist" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh index 94f009fc0d0..2b610af1ebf 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -45,14 +45,12 @@ function cleanup log_onexit cleanup -# create a pool with legacy dedup enabled. we disable block cloning to ensure -# it doesn't get in the way of dedup, and we disable compression so our writes +# create a pool with legacy dedup enabled. 
we disable compression so our writes # create predictable results on disk # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting log_must zpool create -f \ -o feature@fast_dedup=disabled \ -O dedup=on \ - -o feature@block_cloning=disabled \ -O compression=off \ -O xattr=sa \ $TESTPOOL $DISKS @@ -84,7 +82,7 @@ log_must zpool set feature@fast_dedup=enabled $TESTPOOL log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" # copy the file -log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128k log_must zpool sync # feature should still be enabled @@ -127,4 +125,6 @@ obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') # with one ZAP inside log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 +log_must zdb -b $TESTPOOL + log_pass "legacy dedup tables work after upgrade; new dedup tables created as FDT" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh index 9f6b1ef12a9..c137f7b9499 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh @@ -37,14 +37,12 @@ function cleanup log_onexit cleanup -# create a pool with legacy dedup enabled. we disable block cloning to ensure -# it doesn't get in the way of dedup, and we disable compression so our writes +# create a pool with legacy dedup enabled. 
we disable compression so our writes # create predictable results on disk # Use 'xattr=sa' to prevent selinux xattrs influencing our accounting log_must zpool create -f \ -o feature@fast_dedup=disabled \ -O dedup=on \ - -o feature@block_cloning=disabled \ -O compression=off \ -O xattr=sa \ $TESTPOOL $DISKS @@ -102,4 +100,6 @@ log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=4'" # should be just one DDT ZAP in the MOS log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 +log_must zdb -b $TESTPOOL + log_pass "dedup (legacy) retains version after import" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh index 6b4937cc4a2..d80fbe9795d 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh @@ -69,12 +69,12 @@ function ddt_entries log_onexit cleanup -log_must zpool create -f -o feature@block_cloning=disabled $TESTPOOL $DISKS +log_must zpool create -f $TESTPOOL $DISKS log_must zfs create -o recordsize=512 -o dedup=on $TESTPOOL/$TESTFS typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) log_must dd if=/dev/urandom of=$mountpoint/f1 bs=512k count=1 -log_must cp $mountpoint/f1 $mountpoint/f2 +log_must dd if=$mountpoint/f1 of=$mountpoint/f2 bs=512k sync_pool $TESTPOOL entries=$(ddt_entries) log_note "ddt entries before: $entries" @@ -95,5 +95,6 @@ new_entries=$(ddt_entries) [[ "$((entries / 4))" -eq "$new_entries" ]] || \ log_fail "DDT entries did not shrink enough: $entries -> $new_entries" +log_must zdb -b $TESTPOOL log_pass "DDT pruning correctly removes non-duplicate entries" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh index 597bad253ec..41586204333 100755 --- 
a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh @@ -83,4 +83,6 @@ log_must zpool import $TESTPOOL nleafs=$(zdb -dddd $TESTPOOL "$zap_obj" | grep "Leaf blocks:" | awk -F\: '{print($2);}') log_must test $nleafs -lt $nleafs_old +log_must zdb -b $TESTPOOL + log_pass "ZAP object shrank after removing entries." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/devices/devices_common.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/devices/devices_common.kshlib index 8024067ac9e..3298b49fec7 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/devices/devices_common.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/devices/devices_common.kshlib @@ -54,9 +54,9 @@ function create_dev_file # %t - major device type in hex # %T - minor device type in hex # - major=$(stat --dereference --format="%t" "$devstr") - minor=$(stat --dereference --format="%T" "$devstr") - log_must mknod $filename b "0x${major}" "0x${minor}" + major=$(printf '%d' 0x$(stat -L -c "%t" "$devstr")) + minor=$(printf '%d' 0x$(stat -L -c "%T" "$devstr")) + log_must mknod $filename b "${major}" "${minor}" ;; *) # @@ -83,9 +83,9 @@ function create_dev_file # %t - major device type in hex # %T - minor device type in hex # - major=$(stat --format="%t" /dev/null) - minor=$(stat --format="%T" /dev/null) - log_must mknod $filename c "0x${major}" "0x${minor}" + major=$(printf '%d' 0x$(stat -c "%t" /dev/null)) + minor=$(printf '%d' 0x$(stat -c "%T" /dev/null)) + log_must mknod $filename c "${major}" "${minor}" ;; FreeBSD) # diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh index 023f5b58a6e..529a6a8c3fe 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh @@ -84,8 +84,8 @@ for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \ $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \ spare $SPARE_DEV1 - SPARE1=$SPARE_DEV1 - SPARE2="draid2-0-0" + SPARE1="draid2-0-0" + SPARE2=$SPARE_DEV1 elif [ "$type" = "mirror" ]; then # 1. Create a 3-way mirror pool with two hot spares truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS @@ -167,8 +167,8 @@ for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \ $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \ spare $SPARE_DEV1 - SPARE1=$SPARE_DEV1 - SPARE2="draid2-0-0" + SPARE1="draid2-0-0" + SPARE2=$SPARE_DEV1 elif [ "$type" = "mirror" ]; then # 1. Create a 3-way mirror pool with two hot spares truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_rotational.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_rotational.ksh new file mode 100755 index 00000000000..5378979a8bb --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_rotational.ksh @@ -0,0 +1,84 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2026, TrueNAS. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# ZED prefers the smallest sufficient spare when replacing a faulted +# special vdev, regardless of spare list order. +# +# The 'rotational' property is persisted in the pool config for all leaf +# vdevs so that spare selection can match device type even after the +# original disk is gone. ZED sorts spares preferring matching rotational +# and, among equally-matching spares, the smallest sufficient one. +# +# STRATEGY: +# 1. Create a pool with a normal mirror, a special mirror, and two file +# spares of different sizes. List the larger spare first so that the +# sorted order contradicts the list order. +# 2. Fault a member of the special mirror; verify ZED activates the +# smaller sufficient spare, leaving the larger spare available. +# + +verify_runnable "both" + +NORM1="$TEST_BASE_DIR/rotational-norm1" +NORM2="$TEST_BASE_DIR/rotational-norm2" +SPEC1="$TEST_BASE_DIR/rotational-spec1" +SPEC2="$TEST_BASE_DIR/rotational-spec2" +SPARE_SMALL="$TEST_BASE_DIR/rotational-spare-small" +SPARE_LARGE="$TEST_BASE_DIR/rotational-spare-large" + +LARGE_SIZE=$((MINVDEVSIZE * 2)) + +function cleanup +{ + log_must zinject -c all + destroy_pool $TESTPOOL + rm -f $NORM1 $NORM2 $SPEC1 $SPEC2 $SPARE_SMALL $SPARE_LARGE +} + +log_assert "ZED selects smallest sufficient spare for a faulted special vdev" +log_onexit cleanup + +zed_events_drain + +log_must truncate -s $MINVDEVSIZE $NORM1 $NORM2 $SPEC1 $SPEC2 $SPARE_SMALL +log_must truncate -s $LARGE_SIZE $SPARE_LARGE + +# SPARE_LARGE is listed first so that size-preference sorting is what +# causes SPARE_SMALL to be selected, not merely list order. 
+log_must zpool create -f $TESTPOOL \ + mirror $NORM1 $NORM2 \ + special mirror $SPEC1 $SPEC2 \ + spare $SPARE_LARGE $SPARE_SMALL + +log_must zinject -d $SPEC1 -e io -T all -f 100 $TESTPOOL +log_must zpool scrub $TESTPOOL + +log_note "Wait for ZED to auto-spare the special vdev" +log_must wait_vdev_state $TESTPOOL $SPEC1 "FAULTED" 60 +log_must wait_hotspare_state $TESTPOOL $SPARE_SMALL "INUSE" + +# The larger spare must not have been activated. +log_must wait_hotspare_state $TESTPOOL $SPARE_LARGE "AVAIL" + +log_must check_state $TESTPOOL "" "DEGRADED" + +log_pass "ZED activated the smallest sufficient spare for the special vdev" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index 2b5a28b0620..ae8a4b2a648 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -435,32 +435,38 @@ function verify_draid_pool log_note "verify_draid_pool $pool $replace_mode" log_must zpool scrub -w $pool + sync_pool $pool true - typeset -i cksum=$(cksum_pool $pool) + typeset status=$(zpool status -p $pool) + typeset -i cksum=$(echo "$status" | awk ' + !NF { isvdev = 0 } + isvdev { errors += $NF } + /CKSUM$/ { isvdev = 1 } + END { print errors }') if [[ "$replace_mode" = "healing" ]]; then if [[ $cksum -gt 0 ]]; then - log_must zpool status -v $pool + log_note "$status" log_fail "Unexpected CKSUM errors found for $pool ($cksum)" fi if ! 
check_pool_status $pool "scan" "repaired 0B"; then - log_must zpool status -v $pool + log_note "$status" log_fail "Unexpected repair IO found for $pool ($cksum)" fi elif [[ "$replace_mode" = "sequential" ]]; then if [[ $cksum -gt 0 ]]; then - log_must zpool status -v $pool + log_note "$status" log_fail "Unexpected CKSUM errors found for $pool ($cksum)" fi elif [[ "$replace_mode" = "damaged" ]]; then if [[ $cksum -lt 1 ]]; then - log_must zpool status -v $pool + log_note "$status" log_fail "Expected CKSUM errors missing for $pool ($cksum)" fi if check_pool_status $pool "scan" "repaired 0B"; then - log_must zpool status -v $pool + log_note "$status" log_fail "Expected repair IO missing for $pool ($cksum)" fi else @@ -468,12 +474,12 @@ function verify_draid_pool fi if ! check_pool_status $pool "scan" "with 0 errors"; then - log_must zpool status -v $pool + log_note "$status" log_fail "Unexpected repair errors found for $pool" fi if ! check_pool_status $pool "errors" "No known data errors"; then - log_must zpool status -v $pool + log_note "$status" log_fail "Unexpected data errors found for $pool" fi } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh index b94841aed15..cdd2d201e1a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/removal_with_export.ksh @@ -29,7 +29,7 @@ log_onexit default_cleanup_noexit function callback { test_removal_with_operation_kill - log_must zpool export $TESTPOOL + log_must_busy zpool export $TESTPOOL # # We are concurrently starting dd processes that will diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/cleanup.ksh new file mode 100755 index 00000000000..8261885e651 --- 
/dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/cleanup.ksh @@ -0,0 +1,27 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +destroy_pool $POOL +destroy_pool $POOL2 + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg new file mode 100644 index 00000000000..e4999a3ca29 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. 
+# + +read -r DISK1 DISK2 _ <<<"$DISKS" +export DISK1 DISK2 + +export POOL=$TESTPOOL +export POOL2=$TESTPOOL2 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib new file mode 100644 index 00000000000..8e36b748439 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib @@ -0,0 +1,71 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.cfg + +# +# Verify that the DRR_BEGIN records in the given send stream encode their +# nvlist payloads with NV_ENCODE_XDR (and not NV_ENCODE_NATIVE). +# +# DRR_BEGIN records that carry an nvlist payload (raw sends, redacted sends, +# resumed sends, and combinations thereof) must encode that payload with +# NV_ENCODE_XDR so the resulting stream can be portably consumed across +# endianness. Encoding the payload with NV_ENCODE_NATIVE produces a stream +# that is unreadable on a receiver of the opposite endianness. +# +# zstream dump prints a single "nvlist encoding = ..." line per DRR_BEGIN +# record that carries an nvlist payload. 
The possible values are: +# +# NV_ENCODE_XDR +# NV_ENCODE_NATIVE (big-endian) +# NV_ENCODE_NATIVE (little-endian) +# +# Every test in this suite generates a stream whose DRR_BEGIN record +# carries an nvlist payload, so the pass criterion is: +# +# - At least one NV_ENCODE_XDR line appears, AND +# - No NV_ENCODE_NATIVE line appears. +# +# Requiring at least one XDR line catches the case where zstream dump +# itself fails before producing any encoding output. Asserting on dump +# content rather than dump exit status means a partial dump can still +# fail the test on an NV_ENCODE_NATIVE seen before the failure point. +# +function verify_xdr_nvlist_encoding +{ + typeset stream=$1 + typeset out + + [[ -f "$stream" ]] || \ + log_fail "verify_xdr_nvlist_encoding: stream not found: $stream" + + out=$(zstream dump "$stream" 2>/dev/null) + + if echo "$out" | grep -q 'NV_ENCODE_NATIVE'; then + log_fail "verify_xdr_nvlist_encoding: " \ + "NV_ENCODE_NATIVE found in $stream" + fi + if ! echo "$out" | grep -q 'NV_ENCODE_XDR'; then + log_fail "verify_xdr_nvlist_encoding: " \ + "no NV_ENCODE_XDR found in $stream" + fi +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/setup.ksh new file mode 100755 index 00000000000..609acba3a22 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/setup.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +verify_disk_count "$DISKS" 2 + +create_pool $POOL $DISK1 +create_pool $POOL2 $DISK2 + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw.ksh new file mode 100755 index 00000000000..9ba10d9e605 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# A raw incremental send from a redaction bookmark on an encrypted dataset +# (zfs send -w -i ds#book ds@snap) carries both BEGINNV_REDACT_FROM_SNAPS +# and crypt_keydata in its DRR_BEGIN nvlist payload. Verify that this +# combined payload is XDR-encoded and the stream can be received. +# +# Strategy: +# 1. Create an encrypted source dataset with a redaction bookmark and a +# later snapshot. +# 2. Establish a raw base on the receiver via zfs send -w of the bookmark's +# source snapshot. +# 3. zfs send -w -i sendfs#book sendfs@s1 to a file. +# 4. Verify that the resulting stream is XDR-encoded. +# 5. Verify that the zfs receive succeeds. 
+# + +verify_runnable "both" + +sendfs="$POOL/xdr_bookmark_raw_src" +clonefs="$POOL/xdr_bookmark_raw_clone" +recvfs="$POOL2/xdr_bookmark_raw_recv" +keyfile="/$POOL/xdr_bookmark_raw.key" +full_stream="/$POOL/xdr_bookmark_raw_full.zsend" +incr_stream="/$POOL/xdr_bookmark_raw_incr.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $keyfile $full_stream $incr_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a raw incremental from a redaction bookmark is " \ + "XDR-encoded and receivable" + +log_must eval "echo 'thisisapassphrase' > $keyfile" +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=file://$keyfile $sendfs + +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s0 + +# The clone inherits encryption from $sendfs. +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +# Take @s1 with no intervening writes. See xdr_bookmark_raw_with_write.ksh +# for a variant that includes a post-redact write; that variant exercises +# a known kernel-side issue (#18491) and may flake. +log_must zfs snapshot $sendfs@s1 + +# Establish a raw base on the receiver. +log_must eval "zfs send -w $sendfs@s0 > $full_stream" +log_must eval "zfs receive $recvfs < $full_stream" + +# Raw incremental from the redaction bookmark. This is the test focus. 
+log_must eval "zfs send -w -i $sendfs#redaction-bookmark $sendfs@s1 > \ + $incr_stream" +verify_xdr_nvlist_encoding $incr_stream +log_must eval "zfs receive $recvfs < $incr_stream" + +log_pass "BEGIN nvlist of a raw incremental from a redaction bookmark is " \ + "XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh new file mode 100755 index 00000000000..c58735f04d4 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_bookmark_raw_with_write.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# This is the post-redact-write variant of xdr_bookmark_raw, separated out +# because of a known issue (#18491) that causes it to fail roughly 30% of +# the time. It's included here as a test for issue #18491 until the exact +# source of that problem can be pinned down more specifically. +# +# Known issue: openzfs/zfs#18491 +# +# On a freshly-created pool, `zfs send -w -i ds#book ds@snap` intermittently +# fails with EACCES whenever there is data-modifying activity between the +# `zfs redact` that created the bookmark and the subsequent send. 
This EACCES +# is surfaced to userspace as the misleading message "dataset key must be +# loaded," although the key remains loaded throughout. +# +# The reproducer script included in the issue report typically triggers the +# problem within about 10 iterations on a fresh pool. Disk-sync mitigations +# (zpool sync, with or without `-f`, with or without sleep, single or doubled, +# applied at any reasonable point) do not avert the problem. CI runs that +# include the test in this file reproduce the failure regularly (though +# intermittently) across multiple distributions. xdr_resume_bookmark_raw.ksh +# removes the post-redact write (which is not essential to the test) and +# therefore runs reliably. +# +# When this test fails, the failure marker is the libzfs warning +# "dataset key must be loaded" on stderr from the `zfs send -w -i` +# line below (the one that produces the incremental stream), and a +# non-zero exit from that send. The test does not attempt to distinguish +# the known-issue failure from other possible failures.
+# + +verify_runnable "both" + +sendfs="$POOL/xdr_bookmark_raw_with_write_src" +clonefs="$POOL/xdr_bookmark_raw_with_write_clone" +recvfs="$POOL2/xdr_bookmark_raw_with_write_recv" +keyfile="/$POOL/xdr_bookmark_raw_with_write.key" +full_stream="/$POOL/xdr_bookmark_raw_with_write_full.zsend" +incr_stream="/$POOL/xdr_bookmark_raw_with_write_incr.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $keyfile $full_stream $incr_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a raw incremental from a redaction bookmark, " \ + "with a post-redact write, is XDR-encoded and receivable " \ + "(known to flake; see openzfs/zfs#18491)" + +log_must eval "echo 'thisisapassphrase' > $keyfile" +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=file://$keyfile $sendfs + +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s0 + +# The clone inherits encryption from $sendfs. +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +# Post-redact write: the trigger for openzfs/zfs#18491. +log_must dd if=/dev/urandom of=/$sendfs/f3 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s1 + +# Establish a raw base on the receiver. +log_must eval "zfs send -w $sendfs@s0 > $full_stream" +log_must eval "zfs receive $recvfs < $full_stream" + +# The next line is what races. On failure it exits with EACCES rendered +# as "dataset key must be loaded". 
+log_must eval "zfs send -w -i $sendfs#redaction-bookmark $sendfs@s1 > \ + $incr_stream" +verify_xdr_nvlist_encoding $incr_stream +log_must eval "zfs receive $recvfs < $incr_stream" + +log_pass "BEGIN nvlist of a raw incremental from a redaction bookmark, " \ + "with a post-redact write, is XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh new file mode 100755 index 00000000000..ab04f6aa603 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_bookmark.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# An incremental send from a redaction bookmark (zfs send -i ds#book ds@snap) +# carries BEGINNV_REDACT_FROM_SNAPS in its DRR_BEGIN nvlist payload (via the +# from_rl path). Verify that this payload is XDR-encoded and the stream can +# be received. +# +# Strategy: +# 1. Create a source dataset with a redaction bookmark. +# 2. Send a redacted full stream from that bookmark's source snapshot +# and receive it into a second pool as a base. +# 3. Add data and a new snapshot on the source. +# 4. zfs send -i sendfs#redaction-bookmark sendfs@snap to a file. +# 5. 
Verify XDR encoding in the resulting stream. +# 6. Verify that zfs receive of the stream succeeds. +# + +verify_runnable "both" + +sendfs="$POOL/xdr_incr_from_bookmark_src" +clonefs="$POOL/xdr_incr_from_bookmark_clone" +recvfs="$POOL2/xdr_incr_from_bookmark_recv" +full_stream="/$POOL/xdr_incr_from_bookmark_full.zsend" +incr_stream="/$POOL/xdr_incr_from_bookmark_incr.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $full_stream $incr_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of an incremental send from a redaction bookmark " \ + "is XDR-encoded and receivable" + +log_must zfs create $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s0 + +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +# Establish a base on the receiver. +log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream" +log_must eval "zfs receive $recvfs < $full_stream" + +# Add a new snapshot on the source for the incremental. +log_must dd if=/dev/urandom of=/$sendfs/f3 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s1 + +# Generate an incremental send from the redaction bookmark. This fires +# BEGINNV_REDACT_FROM_SNAPS via the from_rl path because the from-side +# is a redaction bookmark. 
+log_must eval "zfs send -i $sendfs#redaction-bookmark $sendfs@s1 > $incr_stream" +verify_xdr_nvlist_encoding $incr_stream +log_must eval "zfs receive $recvfs < $incr_stream" + +log_pass "BEGIN nvlist of an incremental send from a redaction bookmark " \ + "is XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_redacted.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_redacted.ksh new file mode 100755 index 00000000000..fc4d34c4346 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_incr_from_redacted.ksh @@ -0,0 +1,96 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# An incremental send whose from-side is a snapshot of a previously-redacted +# dataset carries BEGINNV_REDACT_FROM_SNAPS in its DRR_BEGIN nvlist payload +# via a different code path than incrementals from a redaction bookmark +# (the dspp->numfromredactsnaps path). Verify that this payload is +# XDR-encoded and that the stream can be received. +# +# Strategy: +# 1. Produce a redacted dataset on a receiver via a redacted full send, +# leaving the receiver with a snapshot whose from-side will carry the +# SPA_FEATURE_REDACTED_DATASETS feature. +# 2. 
Establish the same base on a tertiary destination so we have somewhere +# to apply the incremental. +# 3. Create a new snapshot of the receiver-side redacted dataset. +# 4. zfs send -i mid@s0 mid@s1 to a file. +# 5. Verify that the stream is XDR encoded. +# 6. Verify that we can zfs receive the incremental onto the tertiary base. +# + +verify_runnable "both" + +sendfs="$POOL/xdr_incr_from_redacted_src" +clonefs="$POOL/xdr_incr_from_redacted_clone" +midfs="$POOL2/xdr_incr_from_redacted_mid" +tertiary="$POOL/xdr_incr_from_redacted_tertiary" +full_stream="/$POOL/xdr_incr_from_redacted_full.zsend" +incr_stream="/$POOL/xdr_incr_from_redacted_incr.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $midfs && destroy_dataset $midfs -R + datasetexists $tertiary && destroy_dataset $tertiary -R + rm -f $full_stream $incr_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of an incremental from a previously-redacted " \ + "snapshot is XDR-encoded and receivable" + +log_must zfs create $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s0 + +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +# Produce two receivers of the redacted full send: one we will re-send from +# (mid) and one we will receive the incremental into (tertiary). +log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream" +log_must eval "zfs receive $midfs < $full_stream" +log_must eval "zfs receive $tertiary < $full_stream" + +# Create a fresh snapshot of the redacted receiver. 
The data has not changed +# (and cannot be modified without mounting), but the snapshot itself is +# enough to drive an incremental send and trigger the case-4 nvlist path. +log_must zfs snapshot $midfs@s1 + +# Create an incremental send from the redacted from-side. This fires +# BEGINNV_REDACT_FROM_SNAPS via the dspp->numfromredactsnaps path because +# $midfs@s0 has the SPA_FEATURE_REDACTED_DATASETS feature active. +log_must eval "zfs send -i $midfs@s0 $midfs@s1 > $incr_stream" +verify_xdr_nvlist_encoding $incr_stream +log_must eval "zfs receive $tertiary < $incr_stream" + +log_pass "BEGIN nvlist of an incremental from a previously-redacted snapshot " \ + "is XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_raw.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_raw.ksh new file mode 100755 index 00000000000..c3a196650c6 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_raw.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# A raw send of an encrypted dataset (zfs send -w) carries a "crypt_keydata" +# nested nvlist in its DRR_BEGIN nvlist payload. Verify that this payload is +# XDR-encoded and that the stream can be received. +# +# Strategy: +# 1. 
Create an encrypted dataset with one snapshot. +# 2. zfs send -w to a file. +# 3. Verify that the stream is XDR-encoded. +# 4. Verify that zfs receive succeeds. +# + +verify_runnable "both" + +sendfs="$POOL/xdr_raw_src" +recvfs="$POOL2/xdr_raw_recv" +keyfile="/$POOL/xdr_raw.key" +stream="/$POOL/xdr_raw.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -r + datasetexists $recvfs && destroy_dataset $recvfs -r + rm -f $keyfile $stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a raw send of an encrypted dataset is " \ + "XDR-encoded and receivable" + +log_must eval "echo 'thisisapassphrase' > $keyfile" +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=file://$keyfile $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s1 + +log_must eval "zfs send -w $sendfs@s1 > $stream" + +verify_xdr_nvlist_encoding $stream +log_must eval "zfs receive $recvfs < $stream" + +log_pass "BEGIN nvlist of a raw send of an encrypted dataset is " \ + "XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_full.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_full.ksh new file mode 100755 index 00000000000..2bad9bebdaa --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_full.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# A redacted send (zfs send --redact <bookmark>) carries BEGINNV_REDACT_SNAPS +# in its DRR_BEGIN nvlist payload. Verify that this payload is XDR-encoded and +# the stream can be received. +# +# Strategy: +# 1. Create a source dataset and a divergent clone. +# 2. Create a redaction bookmark on the source snapshot relative to the +# clone snapshot. +# 3. zfs send --redact <bookmark> sendfs@snap to a file. +# 4. verify_xdr_nvlist_encoding on the stream. +# 5. Verify that zfs receive succeeds. +# + +verify_runnable "both" + +sendfs="$POOL/xdr_redacted_full_src" +clonefs="$POOL/xdr_redacted_full_clone" +recvfs="$POOL2/xdr_redacted_full_recv" +stream="/$POOL/xdr_redacted_full.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a redacted send is XDR-encoded and receivable" + +log_must zfs create $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s0 + +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $stream" +verify_xdr_nvlist_encoding $stream +log_must eval "zfs receive $recvfs < $stream" + +log_pass "BEGIN nvlist of a redacted send is XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received.ksh new file
mode 100755 index 00000000000..a18b1f40594 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received.ksh @@ -0,0 +1,84 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# Sending a snapshot from a previously-redacted dataset (one with the +# SPA_FEATURE_REDACTED_DATASETS feature active, e.g., one that was received +# from a redacted send) carries BEGINNV_REDACT_SNAPS in its DRR_BEGIN +# nvlist payload via a different code path than the producer-side --redact +# flag. Verify that this payload is XDR-encoded and that the stream can be +# received. +# +# Strategy: +# 1. Produce a redacted dataset on a receiver via a redacted full send. +# 2. zfs send the received-redacted snapshot to a new dataset. +# 3. Verify XDR encoding on the new stream. +# 4. Verify that a zfs receive of the new stream succeeds. 
+# + +verify_runnable "both" + +sendfs="$POOL/xdr_redacted_received_src" +clonefs="$POOL/xdr_redacted_received_clone" +midfs="$POOL2/xdr_redacted_received_mid" +recvfs="$POOL2/xdr_redacted_received_recv" +full_stream="/$POOL/xdr_redacted_received_full.zsend" +resend_stream="/$POOL/xdr_redacted_received_resend.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $midfs && destroy_dataset $midfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $full_stream $resend_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a send from a previously-redacted dataset is " \ + "XDR-encoded and receivable" + +log_must zfs create $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s0 + +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +# Produce a previously-redacted dataset on the receiver. +log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream" +log_must eval "zfs receive $midfs < $full_stream" + +# Send the received-redacted snapshot. This fires BEGINNV_REDACT_SNAPS via +# the SPA_FEATURE_REDACTED_DATASETS code path on to_ds. 
+log_must eval "zfs send $midfs@s0 > $resend_stream" +verify_xdr_nvlist_encoding $resend_stream +log_must eval "zfs receive $recvfs < $resend_stream" + +log_pass "BEGIN nvlist of a send from a previously-redacted dataset is " \ + "XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received_raw.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received_raw.ksh new file mode 100755 index 00000000000..2efcba32b9f --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_redacted_received_raw.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# zfs send explicitly disallows the source-side combination of -w and +# --redact. However, the same nvlist combination (BEGINNV_REDACT_SNAPS +# together with crypt_keydata) can still be reached by: +# +# 1. Sending a redacted (non-raw) stream from an unencrypted source. +# 2. Receiving it with receiver-side encryption. +# 3. Re-sending the now-encrypted-and-redacted dataset with -w. +# +# The final stream's DRR_BEGIN nvlist contains both the redact-snaps array +# (via the SPA_FEATURE_REDACTED_DATASETS code path on to_ds) and +# crypt_keydata (via DMU_BACKUP_FEATURE_RAW). 
Verify that this combined +# payload is XDR-encoded and that the stream can be received. +# +# Strategy: +# 1. Create an unencrypted source dataset with a redaction bookmark. +# 2. zfs send --redact <bookmark> sendfs@snap to a file (no -w). +# 3. zfs receive into a new dataset with -o encryption=on (receiver-side +# encryption). +# 4. zfs send -w the received dataset to a second stream file. +# 5. Verify that this second stream is XDR-encoded. +# 6. Verify that the second stream can be zfs received successfully. +# + +verify_runnable "both" + +sendfs="$POOL/xdr_redacted_received_raw_src" +clonefs="$POOL/xdr_redacted_received_raw_clone" +midfs="$POOL2/xdr_redacted_received_raw_mid" +recvfs="$POOL2/xdr_redacted_received_raw_recv" +keyfile="/$POOL/xdr_redacted_received_raw.key" +full_stream="/$POOL/xdr_redacted_received_raw_full.zsend" +resend_stream="/$POOL/xdr_redacted_received_raw_resend.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $midfs && destroy_dataset $midfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $keyfile $full_stream $resend_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a raw send of a received-redacted dataset is " \ + "XDR-encoded and receivable" + +log_must zfs create $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s0 + +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=8 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +# Redacted send (non-raw) into a receiver that establishes its own encryption.
+log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream" +log_must eval "echo 'thisisapassphrase' > $keyfile" +log_must eval "zfs receive -o encryption=on -o keyformat=passphrase " \ + "-o keylocation=file://$keyfile $midfs < $full_stream" + +# Re-send the received stream as a raw (encrypted) stream. The DRR_BEGIN +# nvlist now carries both BEGINNV_REDACT_SNAPS data and crypt_keydata +# (DMU_BACKUP_FEATURE_RAW). +log_must eval "zfs send -w $midfs@s0 > $resend_stream" +verify_xdr_nvlist_encoding $resend_stream +log_must eval "zfs receive $recvfs < $resend_stream" + +log_pass "BEGIN nvlist of a raw send of a received-redacted dataset is " \ + "XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_replication.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_replication.ksh new file mode 100755 index 00000000000..22d0bf20410 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_replication.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# A replication send (zfs send -R) may emit two distinct categories of +# DRR_BEGIN record: +# +# 1. A wrapper BEGIN of type DMU_COMPOUNDSTREAM, generated in libzfs +# (lib/libzfs/libzfs_sendrecv.c), whose nvlist describes the package +# stream. 
This BEGIN has always been XDR-encoded and is not affected +# by the kernel-side encoding changes introduced in PR #18372. +# +# 2. One inner BEGIN record per dataset whose contents are included, +# generated in the kernel (module/zfs/dmu_send.c). These are the BEGIN +# records whose encoding the kernel-side change consolidates to XDR. +# +# All other tests in this suite exercise category (2). This test exercises +# both categories together: it verifies that no BEGIN record produced +# anywhere on the userspace+kernel send path is encoded with NV_ENCODE_NATIVE, +# so a future regression in either layer would be caught. +# +# Strategy: +# 1. Create an unencrypted parent dataset and an encrypted child filesystem +# underneath it, with some data in each. The encrypted child is what +# causes the kernel-side inner BEGIN to actually carry an nvlist payload +# (crypt_keydata) rather than passing through silently. +# 2. Snapshot recursively. +# 3. zfs send -wR parent@snap to a file. The resulting stream contains a +# libzfs-generated wrapper BEGIN with its compound-stream nvlist plus +# one kernel-generated inner BEGIN per dataset; the child's inner BEGIN +# carries crypt_keydata. +# 4. Verify the encoding for the whole stream — this checks every BEGIN +# nvlist line that zstream dump emits, so it covers both the wrapper +# and the encrypted child's inner record. +# 5. Verify that the stream can be zfs received successfully. 
+# + +verify_runnable "both" + +sendfs="$POOL/xdr_replication_src" +childfs="$POOL/xdr_replication_src/child" +recvfs="$POOL2/xdr_replication_recv" +keyfile="/$POOL/xdr_replication.key" +stream="/$POOL/xdr_replication.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $keyfile $stream +} +log_onexit cleanup + +log_assert "BEGIN nvlists in a recursive replication stream (wrapper and inner) are XDR-encoded and receivable" + +log_must zfs create $sendfs +log_must eval "echo 'thisisapassphrase' > $keyfile" +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=file://$keyfile $childfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=4 status=none +log_must dd if=/dev/urandom of=/$childfs/f1 bs=128k count=4 status=none +log_must zfs snapshot -r $sendfs@s0 + +log_must eval "zfs send -wR $sendfs@s0 > $stream" +verify_xdr_nvlist_encoding $stream +log_must eval "zfs receive $recvfs < $stream" + +log_pass "BEGIN nvlists in a recursive replication stream (wrapper and inner) are XDR-encoded and receivable" + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume.ksh new file mode 100755 index 00000000000..e98de4c47f4 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# A token-resumed send (zfs send -t <token>) carries BEGINNV_RESUME_OBJECT +# and BEGINNV_RESUME_OFFSET in its DRR_BEGIN nvlist payload. Verify that +# this payload is XDR-encoded and that the resumed stream can be received. +# +# Strategy: +# 1. Create a small dataset with one snapshot. +# 2. zfs send the snapshot to a file, truncate it, then attempt receive +# so that a resume token is left behind. +# 3. zfs send -t <token> to produce the resumed stream. +# 4. Verify that the resumed stream is XDR-encoded. +# 5. Verify that zfs receive -s on the resumed stream is successful. +# + +verify_runnable "both" + +sendfs="$POOL/xdr_resume_src" +recvfs="$POOL2/xdr_resume_recv" +full_stream="/$POOL/xdr_resume_full.zsend" +resumed_stream="/$POOL/xdr_resume_resumed.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -r + datasetexists $recvfs && destroy_dataset $recvfs -r + rm -f $full_stream $resumed_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a token-resumed send is XDR-encoded and receivable" + +log_must zfs create $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=8 status=none +log_must zfs snapshot $sendfs@s1 + +log_must eval "zfs send $sendfs@s1 > $full_stream" +mess_send_file $full_stream +log_mustnot eval "zfs receive -s $recvfs < $full_stream" + +token=$(get_prop receive_resume_token $recvfs) +[[ -n "$token" && "$token" != "-" ]] || \ + log_fail "no resume token left behind by partial receive" +log_must eval "zfs send -t $token > $resumed_stream" + +verify_xdr_nvlist_encoding $resumed_stream +log_must eval "zfs receive -s $recvfs < $resumed_stream" + +log_pass "BEGIN nvlist of a token-resumed send is XDR-encoded and receivable" diff --git
a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh new file mode 100755 index 00000000000..6645315fcd7 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# The most populated DRR_BEGIN nvlist in the kernel: a token-resumed raw +# incremental from a redaction bookmark carries BEGINNV_REDACT_FROM_SNAPS, +# crypt_keydata, and BEGINNV_RESUME_{OBJECT,OFFSET}. Verify that this +# combined payload is XDR-encoded and the resumed stream can be received. +# +# Strategy: +# 1. Create an encrypted source with a redaction bookmark and a later +# snapshot, mirroring xdr_bookmark_raw. +# 2. Establish a raw base on the receiver. +# 3. zfs send -w -i sendfs#book sendfs@s1 to a file, truncate it, then +# attempt receive so that a resume token is left behind. +# 4. zfs send -t <token> to produce the resumed stream. +# 5. Verify that the resumed stream is XDR-encoded. +# 6. Verify that zfs receive -s of the resumed stream is successful.
+# + +verify_runnable "both" + +sendfs="$POOL/xdr_resume_bookmark_raw_src" +clonefs="$POOL/xdr_resume_bookmark_raw_clone" +recvfs="$POOL2/xdr_resume_bookmark_raw_recv" +keyfile="/$POOL/xdr_resume_bookmark_raw.key" +full_stream="/$POOL/xdr_resume_bookmark_raw_full.zsend" +incr_stream="/$POOL/xdr_resume_bookmark_raw_incr.zsend" +resumed_stream="/$POOL/xdr_resume_bookmark_raw_resumed.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $keyfile $full_stream $incr_stream $resumed_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a token-resumed raw incremental from a redaction " \ + "bookmark is XDR-encoded and receivable" + +log_must eval "echo 'thisisapassphrase' > $keyfile" +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=file://$keyfile $sendfs + +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=16 status=none +log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=16 status=none +log_must zfs snapshot $sendfs@s0 + +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=16 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +# Take @s1 with no intervening write. See xdr_resume_bookmark_raw_with_write.ksh +# for a variant that includes a post-redact write; that variant exercises +# a known kernel-side issue (#18491) and may flake. +log_must zfs snapshot $sendfs@s1 + +# Establish a raw base on the receiver. +log_must eval "zfs send -w $sendfs@s0 > $full_stream" +log_must eval "zfs receive $recvfs < $full_stream" + +# Truncate-and-resume on the raw incremental from the redaction bookmark. 
+log_must eval "zfs send -w -i $sendfs#redaction-bookmark $sendfs@s1 > \ + $incr_stream" +mess_send_file $incr_stream +log_mustnot eval "zfs receive -s $recvfs < $incr_stream" + +token=$(get_prop receive_resume_token $recvfs) +[[ -n "$token" && "$token" != "-" ]] || \ + log_fail "no resume token left behind by partial receive" +log_must eval "zfs send -t $token > $resumed_stream" + +verify_xdr_nvlist_encoding $resumed_stream +log_must eval "zfs receive -s $recvfs < $resumed_stream" + +log_pass "BEGIN nvlist of a token-resumed raw incremental from a redaction " \ + "bookmark is XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh new file mode 100755 index 00000000000..6c0b6b5b4ec --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_bookmark_raw_with_write.ksh @@ -0,0 +1,116 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# This is the post-redact-write variant of xdr_resume_bookmark_raw, +# separated out because of a known issue (#18491) that causes it to fail +# roughly 30% of the time. 
It's included here as a test for issue #18491 +# until the exact source of that problem can be pinned down more specifically. +# +# Known issue: openzfs/zfs#18491 +# +# On a freshly-created pool, `zfs send -w -i ds#book ds@snap` intermittently +# fails with EACCES whenever there is data-modifying activity between the +# `zfs redact` that created the bookmark and the subsequent send. This EACCES +# is surfaced to userspace as the misleading message "dataset key must be +# loaded," although the key remains loaded throughout. +# +# The reproducer script included in the issue report typically triggers the +# problem within about 10 iterations on a fresh pool. Disk-sync mitigations +# (zpool sync, with or without `-f`, with or without sleep, single or doubled, +# applied at any reasonable point) do not avert the problem. CI runs that +# include the test in this file reproduce the failure regularly (though +# intermittently) across multiple distributions. xdr_resume_bookmark_raw.ksh +# removes the post-redact write (which is not essential to the test) and +# therefore runs reliably. +# +# When this test fails, the failure marker is the libzfs warning +# "dataset key must be loaded" on stderr from the first `zfs send -w -i` +# line below (the one that produces the stream we then truncate), and a +# non-zero exit from that send. The test does not attempt to distinguish +# the known-issue failure from other possible failures. 
+# + +verify_runnable "both" + +sendfs="$POOL/xdr_resume_bookmark_raw_with_write_src" +clonefs="$POOL/xdr_resume_bookmark_raw_with_write_clone" +recvfs="$POOL2/xdr_resume_bookmark_raw_with_write_recv" +keyfile="/$POOL/xdr_resume_bookmark_raw_with_write.key" +full_stream="/$POOL/xdr_resume_bookmark_raw_with_write_full.zsend" +incr_stream="/$POOL/xdr_resume_bookmark_raw_with_write_incr.zsend" +resumed_stream="/$POOL/xdr_resume_bookmark_raw_with_write_resumed.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $keyfile $full_stream $incr_stream $resumed_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a token-resumed raw incremental from a redaction " \ + "bookmark, with a post-redact write, is XDR-encoded and receivable " \ + "(known to flake; see openzfs/zfs#18491)" + +log_must eval "echo 'thisisapassphrase' > $keyfile" +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=file://$keyfile $sendfs + +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=16 status=none +log_must zfs snapshot $sendfs@s0 + +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=16 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +# Post-redact write: the trigger for openzfs/zfs#18491. +log_must dd if=/dev/urandom of=/$sendfs/f3 bs=128k count=16 status=none +log_must zfs snapshot $sendfs@s1 + +# Establish a raw base on the receiver. +log_must eval "zfs send -w $sendfs@s0 > $full_stream" +log_must eval "zfs receive $recvfs < $full_stream" + +# The next line is what races. On failure it exits with EACCES rendered +# as "dataset key must be loaded". 
+log_must eval "zfs send -w -i $sendfs#redaction-bookmark $sendfs@s1 > \ + $incr_stream" +mess_send_file $incr_stream +log_mustnot eval "zfs receive -s $recvfs < $incr_stream" + +token=$(get_prop receive_resume_token $recvfs) +[[ -n "$token" && "$token" != "-" ]] || \ + log_fail "no resume token left behind by partial receive" +log_must eval "zfs send -t $token > $resumed_stream" + +verify_xdr_nvlist_encoding $resumed_stream +log_must eval "zfs receive -s $recvfs < $resumed_stream" + +log_pass "BEGIN nvlist of a token-resumed raw incremental from a redaction " \ + "bookmark, with a post-redact write, is XDR-encoded and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_raw.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_raw.ksh new file mode 100755 index 00000000000..a96df10b945 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_raw.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# A resumed raw send (zfs send -t <token> for a raw stream of an encrypted +# dataset) carries both BEGINNV_RESUME_{OBJECT,OFFSET} and the "crypt_keydata" +# nested nvlist in its DRR_BEGIN nvlist payload.
Verify that this combined +# payload is XDR-encoded and the resumed stream can be received. +# +# Strategy: +# 1. Create an encrypted dataset with one snapshot. +# 2. zfs send -w to a file, truncate it, then attempt to zfs receive the +# stream so that a resume token is left behind. +# 3. zfs send -t to produce the resumed raw stream. +# 4. Verify that the resumed stream is XDR-encoded. +# 5. Verify that zfs receive -s receives the resumed stream successfully. +# + +verify_runnable "both" + +sendfs="$POOL/xdr_resume_raw_src" +recvfs="$POOL2/xdr_resume_raw_recv" +keyfile="/$POOL/xdr_resume_raw.key" +full_stream="/$POOL/xdr_resume_raw_full.zsend" +resumed_stream="/$POOL/xdr_resume_raw_resumed.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -r + datasetexists $recvfs && destroy_dataset $recvfs -r + rm -f $keyfile $full_stream $resumed_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a token-resumed raw send is XDR-encoded " \ + "and receivable" + +log_must eval "echo 'thisisapassphrase' > $keyfile" +log_must zfs create -o encryption=on -o keyformat=passphrase \ + -o keylocation=file://$keyfile $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=16 status=none +log_must zfs snapshot $sendfs@s1 + +log_must eval "zfs send -w $sendfs@s1 > $full_stream" +mess_send_file $full_stream +log_mustnot eval "zfs receive -s $recvfs < $full_stream" + +token=$(get_prop receive_resume_token $recvfs) +[[ -n "$token" && "$token" != "-" ]] || \ + log_fail "no resume token left behind by partial receive" +log_must eval "zfs send -t $token > $resumed_stream" + +verify_xdr_nvlist_encoding $resumed_stream +log_must eval "zfs receive -s $recvfs < $resumed_stream" + +log_pass "BEGIN nvlist of a token-resumed raw send is XDR-encoded " \ + "and receivable" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_redacted.ksh 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_redacted.ksh new file mode 100755 index 00000000000..6cee3e51a3d --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/send_xdr_encoding/xdr_resume_redacted.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Garth Snyder. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib +. $STF_SUITE/tests/functional/send_xdr_encoding/send_xdr_encoding.kshlib + +# +# Description: +# A resumed redacted send (zfs send -t for a redacted stream) +# carries both BEGINNV_REDACT_SNAPS and BEGINNV_RESUME_{OBJECT,OFFSET} in +# its DRR_BEGIN nvlist payload. Verify that this combined payload is +# XDR-encoded and the resumed stream can be received. +# +# Strategy: +# 1. Create a source dataset with a redaction bookmark. +# 2. zfs send --redact sendfs@snap to a file, truncate it, then +# attempt zfs receive so that a resume token is left behind. +# 3. zfs send -t to produce a resumed redacted stream. +# 4. Verify that the resumed stream is XDR-encoded. +# 5. Verify that zfs receive -s of the resumed stream is successful. 
+# + +verify_runnable "both" + +sendfs="$POOL/xdr_resume_redacted_src" +clonefs="$POOL/xdr_resume_redacted_clone" +recvfs="$POOL2/xdr_resume_redacted_recv" +full_stream="/$POOL/xdr_resume_redacted_full.zsend" +resumed_stream="/$POOL/xdr_resume_redacted_resumed.zsend" + +function cleanup +{ + datasetexists $sendfs && destroy_dataset $sendfs -R + datasetexists $recvfs && destroy_dataset $recvfs -R + rm -f $full_stream $resumed_stream +} +log_onexit cleanup + +log_assert "BEGIN nvlist of a token-resumed redacted send is XDR-encoded " \ + "and receivable" + +log_must zfs create $sendfs +log_must dd if=/dev/urandom of=/$sendfs/f1 bs=128k count=16 status=none +log_must dd if=/dev/urandom of=/$sendfs/f2 bs=128k count=16 status=none +log_must zfs snapshot $sendfs@s0 + +log_must zfs clone $sendfs@s0 $clonefs +log_must dd if=/dev/urandom of=/$clonefs/f1 bs=128k count=16 conv=notrunc \ + status=none +log_must zfs snapshot $clonefs@s + +log_must zfs redact $sendfs@s0 redaction-bookmark $clonefs@s + +log_must eval "zfs send --redact redaction-bookmark $sendfs@s0 > $full_stream" +mess_send_file $full_stream +log_mustnot eval "zfs receive -s $recvfs < $full_stream" + +token=$(get_prop receive_resume_token $recvfs) +[[ -n "$token" && "$token" != "-" ]] || \ + log_fail "no resume token left behind by partial receive" +log_must eval "zfs send -t $token > $resumed_stream" + +verify_xdr_nvlist_encoding $resumed_stream +log_must eval "zfs receive -s $recvfs < $resumed_stream" + +log_pass "BEGIN nvlist of a token-resumed redacted send is XDR-encoded " \ + "and receivable" + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh index ffc4e96f5a0..0f3b1a84d83 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh @@ -64,7 +64,7 @@ 
function cleanup log_assert "Verify an archive of a file system is identical to " \ "an archive of its snapshot." -SNAPSHOT_TARDIR="$(mktemp -t -d zfstests_snapshot_002.XXXXXX)" +SNAPSHOT_TARDIR="$(mktemp -d "$TEST_BASE_DIR/zfstests_snapshot_002.XXXXXX")" log_onexit cleanup typeset -i COUNT=21 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh index 20d53eb5012..db8d820bdd1 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh @@ -73,7 +73,7 @@ function cleanup log_assert "Verify that an archive of a dataset is identical to " \ "an archive of the dataset's snapshot." -SNAPSHOT_TARDIR="$(mktemp -t -d zfstests_snapshot_006.XXXXXX)" +SNAPSHOT_TARDIR="$(mktemp -d "$TEST_BASE_DIR/zfstests_snapshot_006.XXXXXX")" log_onexit cleanup typeset -i COUNT=21 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh index ab749b5f793..ea10e492503 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/stat/statx_dioalign.ksh @@ -89,7 +89,8 @@ typeset -i PAGE_SIZE=$(getconf PAGE_SIZE) # Set recordsize to 128K, and make a 64K file (so only one block) for the # sizing tests below. log_must zfs set recordsize=128K $TESTDS -log_must dd if=/dev/urandom of=$TESTFILE bs=64k count=1 +log_must rm -f $TESTFILE +log_must stride_dd -i /dev/urandom -o $TESTFILE -b 65536 -c 1 log_must zpool sync # when DIO is disabled via tunable, statx will not return the dioalign result @@ -141,7 +142,7 @@ done # Now we extend the file into its second block. 
This effectively locks in its # block size, which will always be returned regardless of recordsize changes. log_must zfs set recordsize=128K $TESTDS -log_must dd if=/dev/urandom of=$TESTFILE bs=192K count=1 +log_must stride_dd -i /dev/urandom -o $TESTFILE -b 196608 -c 1 log_must zpool sync # Confirm that no matter how we change the recordsize, the alignment remains at @@ -167,14 +168,14 @@ log_must rm -f $TESTFILE log_must touch $TESTFILE log_must zpool sync assert_dioalign $TESTFILE $PAGE_SIZE 16384 -log_must dd if=/dev/urandom of=$TESTFILE bs=16384 count=16 oflag=direct +log_must stride_dd -i /dev/urandom -o $TESTFILE -b 16384 -c 16 -D # same again, but writing with incorrect alignment, which should fail. log_must rm -f $TESTFILE log_must touch $TESTFILE log_must zpool sync assert_dioalign $TESTFILE $PAGE_SIZE 16384 -log_mustnot dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct +log_mustnot stride_dd -i /dev/urandom -o $TESTFILE -b 1024 -c 256 -D # same again, but without strict, which should succeed. log_must set_tunable32 DIO_STRICT 0 @@ -182,6 +183,6 @@ log_must rm -f $TESTFILE log_must touch $TESTFILE log_must zpool sync assert_dioalign $TESTFILE $PAGE_SIZE 16384 -log_must dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct +log_must stride_dd -i /dev/urandom -o $TESTFILE -b 1024 -c 256 -D log_pass $CLAIM diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_008_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_008_pos.ksh new file mode 100755 index 00000000000..c5ad282eb8a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_008_pos.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2026, TrueNAS. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that the 'rotational' vdev property is readable on spare and +# L2ARC vdevs, which have no per-vdev ZAP, and that its value persists +# across export/import when the spare device is absent. +# +# STRATEGY: +# 1. Create a pool with a mirror, a spare, and an L2ARC device. +# 2. Verify 'rotational' is readable on leaf, virtual (mirror), spare, +# and L2ARC vdevs. +# 3. Export the pool, remove the spare file, re-import, and verify that +# 'rotational' still reports the same value for the missing spare, +# proving the value comes from the persisted config. +# + +verify_runnable "global" + +SPARE="$TEST_BASE_DIR/vz008-spare" +L2C="$TEST_BASE_DIR/vz008-l2c" +VDEV1="$TEST_BASE_DIR/vz008-vdev1" +VDEV2="$TEST_BASE_DIR/vz008-vdev2" + +function cleanup +{ + destroy_pool $TESTPOOL + rm -f $VDEV1 $VDEV2 $SPARE $L2C +} + +log_assert "'rotational' is readable on ZAP-less vdevs and persists absent" +log_onexit cleanup + +log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 $SPARE $L2C + +log_must zpool create -f $TESTPOOL \ + mirror $VDEV1 $VDEV2 \ + cache $L2C \ + spare $SPARE + +# Leaf vdev should report rotational. +NR=$(zpool get -H -o value rotational $TESTPOOL $VDEV1) +[[ "$NR" == "on" || "$NR" == "off" ]] || + log_fail "leaf $VDEV1: expected on/off, got '$NR'" + +# Virtual (mirror) vdev should report rotational. +MIRROR=$(zpool list -v -H $TESTPOOL | awk '$1 ~ /^mirror/ {print $1; exit}') +NR=$(zpool get -H -o value rotational $TESTPOOL "$MIRROR") +[[ "$NR" == "on" || "$NR" == "off" ]] || + log_fail "mirror: expected on/off, got '$NR'" + +# Spare vdev should report rotational even though it has no ZAP. 
+NR=$(zpool get -H -o value rotational $TESTPOOL $SPARE) +[[ "$NR" == "on" || "$NR" == "off" ]] || + log_fail "spare $SPARE: expected on/off, got '$NR'" + +# L2ARC vdev should report rotational even though it has no ZAP. +NR=$(zpool get -H -o value rotational $TESTPOOL $L2C) +[[ "$NR" == "on" || "$NR" == "off" ]] || + log_fail "L2ARC $L2C: expected on/off, got '$NR'" + +# The value must persist across export/import when the spare is absent. +# Remove the spare file before re-import so that vdev_open() cannot read +# the hardware value and the only source is the persisted config. +NR_BEFORE=$(zpool get -H -o value rotational $TESTPOOL $SPARE) +log_must zpool export $TESTPOOL +log_must rm -f $SPARE +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL +NR_AFTER=$(zpool get -H -o value rotational $TESTPOOL $SPARE) +[[ "$NR_BEFORE" == "$NR_AFTER" ]] || + log_fail "spare rotational changed across import: $NR_BEFORE -> $NR_AFTER" + +log_pass "'rotational' readable on spare/L2ARC vdevs and persists when absent" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh index 9047f14bc81..0f18f2e5733 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh @@ -58,7 +58,7 @@ biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL) typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \ $num_zvols ))) -typeset tmpdir="$(mktemp -t -d zvol_stress_fio_state.XXXXXX)" +typeset tmpdir="$(mktemp -d "$TEST_BASE_DIR/zvol_stress_fio_state.XXXXXX")" log_must save_tunable VOL_USE_BLK_MQ log_must save_tunable VOL_REQUEST_SYNC diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile index 8fd023005b5..44a63977f28 100644 --- a/sys/modules/zfs/Makefile +++ b/sys/modules/zfs/Makefile @@ -323,6 +323,8 @@ 
SRCS+= abd.c \ vdev_root.c \ vdev_trim.c \ zap.c \ + zap_fat.c \ + zap_impl.c \ zap_leaf.c \ zap_micro.c \ zcp.c \ @@ -439,6 +441,7 @@ CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.zap_leaf.c= -Wno-cast-qual CFLAGS.zap_micro.c= -Wno-cast-qual +CFLAGS.zap_impl.c= -Wno-cast-qual CFLAGS.zcp.c= -Wno-cast-qual CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index b4e4c1e4d29..2086bd330e1 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -258,6 +258,9 @@ /* fs_context exists */ /* #undef HAVE_FS_CONTEXT */ +/* fs_parse() takes fs_parameter_spec directly */ +/* #undef HAVE_FS_PARSE_TAKES_SPEC */ + /* yes */ /* #undef HAVE_GENERIC_FADVISE */ @@ -579,6 +582,12 @@ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ +/* Define if you have POSIX threads libraries and header files. */ +#define HAVE_PTHREAD 1 + +/* Have PTHREAD_PRIO_INHERIT. */ +#define HAVE_PTHREAD_PRIO_INHERIT 1 + /* If available, contains the Python version number currently in use. */ /* #undef HAVE_PYTHON */ @@ -881,6 +890,10 @@ /* make_request_fn() return type */ /* #undef MAKE_REQUEST_FN_RET */ +/* Define to necessary symbol if this constant uses a non-standard name on + your system. */ +/* #undef PTHREAD_CREATE_JOINABLE */ + /* The size of 'off_t', as computed by sizeof. */ /* #undef SIZEOF_OFF_T */ @@ -914,7 +927,7 @@ /* #undef ZFS_DEVICE_MINOR */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.4.99-566-FreeBSD_ga12c6ed62" +#define ZFS_META_ALIAS "zfs-2.4.99-695-FreeBSD_ga170134fe" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -944,7 +957,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. 
*/ -#define ZFS_META_RELEASE "566-FreeBSD_ga12c6ed62" +#define ZFS_META_RELEASE "695-FreeBSD_ga170134fe" /* Define the project version. */ #define ZFS_META_VERSION "2.4.99" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 113c3d7a5bb..f783cf01c72 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.4.99-566-ga12c6ed62-dirty" +#define ZFS_META_GITREV "zfs-2.4.99-695-ga170134fe"