Notable upstream pull request merges:
 #18372 eaaea55b6 Consistently encode DRR_BEGIN packed nvlist payloads with
                  NV_ENCODE_XDR
 #18410 891e379d0 Fix failfast default and usage
 #18470 a2d053329 zdb: Add some more file layout output, triggered by -v
 #18472 d50f5b6d0 dsl_dir: avoid dd_lock during snapshots_changed updates
 #18493 d65015938 Vdev allocation bias/class change
 #18497 8fdc86675 zfs: annotate nested dd_lock in reservation sync
                  accounting
 #18494 956deba27 zdb: detect BRT and DDT leaks during block traversal
 #18499 c7cfe0805 zarcstat: detect attached L2ARC device with no data
 #18503 439b802e7 sa: fix sa_add_projid lock ordering
 #18508 968f4db03 zpool-attach.8: add EXAMPLES section
 #18513 45dddc452 zfs.4: Fix documentation of zfs_arc_dnode_reduce_percent
 #18516 8ff64005a zap: split implementation out into more files
 #18520 181e1b522 Fix double free for blocks cloned after DDT prune
 #18535 -multiple zstream: fix crashes when refcount tracking enabled
 #18536 -multiple refcount tag fixups
 #18541 a65ed7afd zpool/zfs: accept --help and -? after a subcommand
 #18544 6fb72fda0 zio_ddt_write: compute have_dvas after taking dde_io_lock
 #18546 -multiple zap: internal locking uplift
 #18550 40a87651d zap_impl: use flex array field for mzap_phys_t.mz_chunks
 #18551 -multiple zap: make the _by_dnode() op variants be the primary
                  implementation
 #18570 112b0131b zpl_xattr: stop heap-allocating prefixed xattr names
 #18578 4bc8c39b6 zed: Prefer dRAID distributed spares to regular ones
 #18596 e30ab5fa4 FreeBSD: Make it possible to build openzfs.ko with
                  sanitizers
 #18597 472ddca11 zed: Prefer spares with matching rotational and size
 #18599 c90dc2808 enforce exact decompressed length for lz4, gzip, and zstd
 #18603 -multiple zap: add zap_cursor_init_by_dnode; cursor unit tests;
                  mock dnode refcounts
 #18604 59dc88602 nvpair: Check for un-terminated strings in packed nvlist
 #18606 ef6f26145 When reading a vdev label skip libzfs_core_init()
 #18613 0aa4088dc sharenfs: Check for invalid characters
 #18615 80fb85b80 Fix the integer type in zfs_ioc_userspace_many()
 #18616 e199f6d98 Fix uninitialized variable warning in vdev_prop_get()
 #18617 7de42602c Extend dataset zfs_ioc_set_prop() secpolicy
 #18622 5fea0c838 Parallelize metaslab_sync_done() calls
 #18623 cab50d5ad Add additional verification of size fields and strings
 #18630 -multiple zap: misc function removal / uplift / tests
 #18633 a8ef128da Fix uninitialized variable warning in zil_parse()

Obtained from:	OpenZFS
OpenZFS commit:	a170134feb
This commit is contained in:
Martin Matuska
2026-06-06 22:48:32 +02:00
196 changed files with 14745 additions and 4032 deletions
+2
View File
@@ -163,6 +163,8 @@ KERNEL_C = \
vdev_root.c \
vdev_trim.c \
zap.c \
zap_fat.c \
zap_impl.c \
zap_leaf.c \
zap_micro.c \
zcp.c \
+2
View File
@@ -346,6 +346,8 @@ contrib/openzfs/module/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_trim.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/zap.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/zap_fat.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/zap_impl.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/zcp.c optional zfs compile-with "${ZFS_C}"
+82 -47
View File
@@ -1,61 +1,96 @@
## The testings are done this way
## CI overview
The main test pipeline is `zfs-qemu.yml`. Code checking and other
workflows run independently alongside it.
```mermaid
flowchart TB
subgraph CleanUp and Summary
CleanUp+Summary
subgraph Functional testing
Setup[test-config: pick ci_type + OS matrix]
Setup --> almalinux
Setup --> centos[centos-stream]
Setup --> debian
Setup --> fedora
Setup --> ubuntu
Setup --> freebsd
almalinux --> Cleanup[cleanup + summary]
centos --> Cleanup
debian --> Cleanup
fedora --> Cleanup
ubuntu --> Cleanup
freebsd --> Cleanup
end
subgraph Functional Testings
sanity-checks-20.04
zloop-checks-20.04
functional-testing-20.04-->Part1-20.04
functional-testing-20.04-->Part2-20.04
functional-testing-20.04-->Part3-20.04
functional-testing-20.04-->Part4-20.04
functional-testing-22.04-->Part1-22.04
functional-testing-22.04-->Part2-22.04
functional-testing-22.04-->Part3-22.04
functional-testing-22.04-->Part4-22.04
sanity-checks-22.04
zloop-checks-22.04
end
subgraph Code Checking + Building
Build-Ubuntu-20.04
subgraph Code checking
checkstyle.yaml
codeql.yml
checkstyle.yml
Build-Ubuntu-22.04
smatch.yml
end
Build-Ubuntu-20.04-->sanity-checks-20.04
Build-Ubuntu-20.04-->zloop-checks-20.04
Build-Ubuntu-20.04-->functional-testing-20.04
Build-Ubuntu-22.04-->sanity-checks-22.04
Build-Ubuntu-22.04-->zloop-checks-22.04
Build-Ubuntu-22.04-->functional-testing-22.04
sanity-checks-20.04-->CleanUp+Summary
Part1-20.04-->CleanUp+Summary
Part2-20.04-->CleanUp+Summary
Part3-20.04-->CleanUp+Summary
Part4-20.04-->CleanUp+Summary
Part1-22.04-->CleanUp+Summary
Part2-22.04-->CleanUp+Summary
Part3-22.04-->CleanUp+Summary
Part4-22.04-->CleanUp+Summary
sanity-checks-22.04-->CleanUp+Summary
subgraph Other workflows
zfs-arm.yml
zloop.yml
labels.yml
end
```
Every `qemu-vm` matrix entry runs on a fixed `ubuntu-24.04` host.
The steps inside one entry are:
1) build zfs modules for Ubuntu 20.04 and 22.04 (~15m)
2) 2x zloop test (~10m) + 2x sanity test (~25m)
3) 4x functional testings in parts 1..4 (each ~1h)
4) cleanup and create summary
- content of summary depends on the results of the steps
1) set up QEMU and boot the guest (~2-4m)
2) install build dependencies in the guest (~2-4m)
3) build zfs modules in the guest (~8-12m)
4) run functional tests (~2-4h)
5) package and upload per-OS test logs (~10s)
When everything runs fine, the full run should be done in
about 2 hours.
A per-OS entry takes about 3 to 4 hours. Once all entries finish, the
`cleanup` job aggregates the results into a summary.
The codeql.yml and checkstyle.yml workflows are not part of this cycle.
### `ci_type` selection
`test-config` runs `.github/workflows/scripts/generate-ci-type.py` against
the PR's changed files and picks one of:
| `ci_type` | OS matrix |
|-----------|--------------------------------------------|
| `docs` | empty (documentation-only PRs) |
| `quick` | 6 Linux + 1 FreeBSD |
| `linux` | all supported Linux distros |
| `freebsd` | all supported FreeBSD versions |
| default | cross-platform sample |
Pushes to `openzfs/zfs` skip the matrix entirely; only PRs (and pushes to
forks) build.
Authors can force a specific ci_type by adding `ZFS-CI-Type: <type>` to
the most recent commit message. The `ZTS_OS_OVERRIDE` repository variable
can also alter the selection. The `workflow_dispatch` trigger accepts
`fedora_kernel_ver` (Fedora-only run with a chosen kernel) and
`specific_os` (pin the matrix to one OS).
### Supported guests
Auto-selected:
- Linux: almalinux 8/9/10, centos-stream 9/10, debian 11/12/13,
fedora 43/44, ubuntu 22/24/26
- FreeBSD: 14.4-RELEASE/STABLE, 15.0-RELEASE, 15.1-STABLE, 16.0-CURRENT
Available via `specific_os` or `ZTS_OS_OVERRIDE`:
- archlinux, tumbleweed
### Code checking
- `checkstyle.yaml`: source-style checks
- `codeql.yml`: CodeQL analysis
- `smatch.yml`: smatch analysis
### Other workflows
- `zfs-arm.yml`: ARM build on `ubuntu-24.04-arm`
- `zloop.yml`: host-side zloop
- `labels.yml`: maintains PR status labels
- `zfs-qemu-packages.yml`: manually dispatched, builds release RPMs or
tests RPM installation from the ZFS yum repo
+1 -1
View File
@@ -12,7 +12,7 @@ jobs:
checkstyle:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: Install dependencies
+4 -4
View File
@@ -11,7 +11,7 @@ concurrency:
jobs:
analyze:
name: Analyze
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
permissions:
actions: read
contents: read
@@ -31,15 +31,15 @@ jobs:
uses: actions/checkout@v6
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
uses: github/codeql-action/init@v4
with:
config-file: .github/codeql-${{ matrix.language }}.yml
languages: ${{ matrix.language }}
- name: Autobuild
uses: github/codeql-action/autobuild@v3
uses: github/codeql-action/autobuild@v4
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
uses: github/codeql-action/analyze@v4
with:
category: "/language:${{matrix.language}}"
@@ -6,6 +6,9 @@
Output format: "<type> <source>" where source is "manual" (from
ZFS-CI-Type commit tag) or "auto" (from file change heuristics).
Prints "docs auto" if every changed file is documentation; the qemu
matrix is skipped in that case.
Prints "quick manual" if:
- the *last* commit message contains 'ZFS-CI-Type: quick'
or "quick auto" if (heuristics):
@@ -28,10 +31,24 @@
r'.*\.gitignore'
]))
"""
Patterns of files that are documentation only.
"""
# These patterns are applied with re.match(), which anchors only at the
# start of the path.  Each pattern is therefore end-anchored with '$' so
# that a path merely *containing* a documentation-looking component
# (e.g. "scripts/x.md5.sh" or "AUTHORS.c") is not treated as
# documentation-only.
DOCS_ONLY_REGEX = list(map(re.compile, [
    r'man/.*$',
    r'.*\.md$',
    r'AUTHORS$',
    r'COPYRIGHT$',
    r'LICENSE$',
    r'NOTICE$',
    r'\.gitignore$',
]))
"""
Patterns of files that are considered to trigger full CI.
"""
FULL_RUN_REGEX = list(map(re.compile, [
r'\.github/workflows/.*\.ya?ml',
r'\.github/workflows/scripts/.*',
r'cmd.*',
r'configs/.*',
@@ -116,6 +133,12 @@ def output_type(type, source, reason):
f'changed file "{f}" matches pattern "{r.pattern}"'
)
if changed_files and all(
any(r.match(f) for r in DOCS_ONLY_REGEX)
for f in changed_files):
output_type('docs', 'auto',
'all changed files are documentation')
# catch-all
output_type('quick', 'auto',
'no changed file matches full CI patterns')
@@ -17,6 +17,8 @@ sudo docker builder prune -a
unneeded="microsoft-edge-stable|azure-cli|google-cloud|google-chrome-stable|"\
"temurin|llvm|firefox|mysql-server|snapd|android|dotnet|haskell|ghcup|"\
"powershell|julia|swift|miniconda|chromium"
# refresh package index before removing packages
sudo apt-get -y update
sudo apt-get -y remove $(dpkg-query -f '${binary:Package}\n' -W | grep -E "'$unneeded'")
sudo apt-get -y autoremove
+29 -24
View File
@@ -28,6 +28,7 @@ NIC="virtio"
# additional options for virt-install
OPTS[0]=""
OPTS[1]=""
ALT_URL=""
case "$OS" in
almalinux8)
@@ -56,11 +57,22 @@ case "$OS" in
centos-stream9)
OSNAME="CentOS Stream 9"
URL="https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-GenericCloud-9-latest.x86_64.qcow2"
# Sometimes we get HTTP errors for the first link. Fall back to the
# "Composes" repo as an alternative. The "Composes" repo includes
# autogenerated nightly CentOS Stream images. We have to lookup the URL
# dynamically since the qcow2 file name has the date in it.
ALT_URL=$(wget --accept "CentOS-Stream-GenericCloud-9-*.x86_64.qcow2" --spider -np --recursive --no-verbose \
https://composes.stream.centos.org/stream-9/production/latest-CentOS-Stream/compose/BaseOS/x86_64/images/ 2>&1 | \
awk '/200 OK/{print $(NF-2)}')
;;
centos-stream10)
OSNAME="CentOS Stream 10"
OSv="centos-stream9"
URL="https://cloud.centos.org/centos/10-stream/x86_64/images/CentOS-Stream-GenericCloud-10-latest.x86_64.qcow2"
ALT_URL=$(wget --accept "CentOS-Stream-GenericCloud-10-*.x86_64.qcow2" --spider -np --recursive --no-verbose \
https://composes.stream.centos.org/stream-10/production/latest-CentOS-Stream/compose/BaseOS/x86_64/images/ 2>&1 | \
awk '/200 OK/{print $(NF-2)}')
;;
debian11)
OSNAME="Debian 11"
@@ -78,11 +90,6 @@ case "$OS" in
OPTS[0]="--boot"
OPTS[1]="uefi=on"
;;
fedora42)
OSNAME="Fedora 42"
OSv="fedora-unknown"
URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2"
;;
fedora43)
OSNAME="Fedora 43"
OSv="fedora-unknown"
@@ -93,14 +100,6 @@ case "$OS" in
OSv="fedora-unknown"
URL="https://download.fedoraproject.org/pub/fedora/linux/releases/44/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-44-1.7.x86_64.qcow2"
;;
freebsd13-5r)
FreeBSD="13.5-RELEASE"
OSNAME="FreeBSD $FreeBSD"
OSv="freebsd13.0"
URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
NIC="rtl8139"
;;
freebsd14-4r)
FreeBSD="14.4-RELEASE"
OSNAME="FreeBSD $FreeBSD"
@@ -111,18 +110,10 @@ case "$OS" in
freebsd15-0r)
FreeBSD="15.0-RELEASE"
OSNAME="FreeBSD $FreeBSD"
OSv="freebsd15.0"
OSv="freebsd14.0"
URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
;;
freebsd13-5s)
FreeBSD="13.5-STABLE"
OSNAME="FreeBSD $FreeBSD"
OSv="freebsd13.0"
URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
NIC="rtl8139"
;;
freebsd14-4s)
FreeBSD="14.4-STABLE"
OSNAME="FreeBSD $FreeBSD"
@@ -131,7 +122,7 @@ case "$OS" in
KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
;;
freebsd15-1s)
FreeBSD="15.1-PRERELEASE"
FreeBSD="15.1-STABLE"
OSNAME="FreeBSD $FreeBSD"
OSv="freebsd14.0"
URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
@@ -160,6 +151,11 @@ case "$OS" in
OSv="ubuntu24.04"
URL="$UBMIRROR/noble/current/noble-server-cloudimg-amd64.img"
;;
ubuntu26)
OSNAME="Ubuntu 26.04"
OSv="ubuntu24.04"
URL="$UBMIRROR/resolute/current/resolute-server-cloudimg-amd64.img"
;;
*)
echo "Wrong value for OS variable!"
exit 111
@@ -173,7 +169,6 @@ echo "ENV=$ENV" >> $ENV
# result path
echo 'RESPATH="/var/tmp/test_results"' >> $ENV
# FreeBSD 13 has problems with: e1000 and virtio
echo "NIC=$NIC" >> $ENV
# freebsd15 -> used in zfs-qemu.yml
@@ -221,6 +216,16 @@ for cmd in 'axel -q -o' 'curl --fail -LSs -o' ; do
if [ -s "$IMG" ] ; then
# Successful download
break
else
if [ -n "$ALT_URL" ] ; then
# Try the $ALT_URL if specified
echo "Loading alternative $ALT_URL with $cmd..."
time eval "$cmd $IMG $ALT_URL"
if [ -s "$IMG" ]; then
# Successful ALT_URL download
break
fi
fi
fi
done
@@ -215,7 +215,7 @@ case "$1" in
tumbleweed)
tumbleweed
;;
ubuntu*)
ubuntu22|ubuntu24)
debian
echo "##[group]Install Ubuntu specific"
sudo apt-get install -yq linux-tools-common libtirpc-dev \
@@ -226,6 +226,27 @@ case "$1" in
# https://github.com/actions/runner-images/issues/9946
sudo apt-get install -yq build-essential
echo "##[endgroup]"
echo "##[group]Delete Ubuntu OpenZFS modules"
for i in $(find /lib/modules -name zfs -type d); do sudo rm -rvf $i; done
echo "##[endgroup]"
;;
ubuntu26)
debian
echo "##[group]Install Ubuntu specific"
# Skip linux-modules-extra which is already installed
sudo apt-get install -yq linux-tools-common
sudo apt-get install -yq libtirpc-dev
sudo apt-get install -yq dh-sequence-dkms
# Need 'build-essential' explicitly for ARM builder
# https://github.com/actions/runner-images/issues/9946
sudo apt-get install -yq build-essential
# Replace sudo-rs with sudo for now because the Rust version
# does not support -E to preserve the entire environment
sudo update-alternatives --set sudo /usr/bin/sudo.ws
echo "##[endgroup]"
echo "##[group]Delete Ubuntu OpenZFS modules"
for i in $(find /lib/modules -name zfs -type d); do sudo rm -rvf $i; done
@@ -267,8 +288,19 @@ case "$1" in
;;
debian*|ubuntu*)
sudo -E systemctl enable nfs-kernel-server
sudo -E systemctl enable qemu-guest-agent
sudo -E systemctl enable smbd
# enable usershares (disabled by default on ubuntu 26.04)
sudo -E sed -i '/usershare max shares/s/^#//' /etc/samba/smb.conf
# add systemd drop-in to allow the service to be enabled
sudo -E mkdir -p /etc/systemd/system/qemu-guest-agent.service.d/
sudo -E tee /etc/systemd/system/qemu-guest-agent.service.d/override.conf <<EOF
[Install]
WantedBy=multi-user.target
EOF
sudo -E systemctl daemon-reload
sudo -E systemctl enable qemu-guest-agent
;;
*)
# All other linux distros
@@ -292,7 +324,7 @@ case "$1" in
echo 'GRUB_SERIAL_COMMAND="serial --speed=115200"' \
| sudo tee -a /etc/default/grub >/dev/null
;;
ubuntu24)
ubuntu24|ubuntu26)
GRUB_CFG="/boot/grub/grub.cfg"
GRUB_MKCONFIG="grub-mkconfig"
echo 'GRUB_DISABLE_OS_PROBER="false"' \
@@ -2,9 +2,12 @@
# 3) Wait for VM to boot from previous step and launch dependencies
# script on it.
#
# $1: OS name (like 'fedora41')
# $2: (optional) Experimental kernel version to install on fedora,
# like "6.14".
# qemu-3-deps.sh [--poweroff] OS_NAME [FEDORA_VERSION]
#
# --poweroff: Power off the VM after installing dependencies
# OS_NAME: OS name (like 'fedora41')
# FEDORA_VERSION: (optional) Experimental Fedora kernel version, like "6.14" to
# install instead of Fedora defaults.
######################################################################
.github/workflows/scripts/qemu-wait-for-vm.sh vm0
@@ -15,8 +18,13 @@
# we need to update the kernel version in zfs's META file to allow the
# build to happen. We update our local copy of META here, since we know
# it will be rsync'd up in the next step.
if [ -n "${2:-}" ] ; then
sed -i -E 's/Linux-Maximum: .+/Linux-Maximum: 99.99/g' META
#
# Look to see if the last argument looks like a kernel version.
ver="${@: -1}"
if [[ $ver =~ ^[0-9]+\.[0-9]+ ]] ; then
# We got a kernel version, update META to say we support it so we
# can test against it.
sed -i -E 's/Linux-Maximum: .+/Linux-Maximum: '$ver'/g' META
fi
scp .github/workflows/scripts/qemu-3-deps-vm.sh zfs@vm0:qemu-3-deps-vm.sh
@@ -5,10 +5,12 @@
#
# Usage:
#
# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--patch-level NUM]
# [--poweroff][--release][--repo][--tarball]
# qemu-4-build-vm.sh OS [--custom-branch BRANCH][--enable-debug][--dkms]
# [--patch-level NUM][--poweroff][--release][--repo][--tarball]
#
# OS: OS name like 'fedora41'
# --custom-branch: When building packages, checkout this version of ZFS to
# build, but use the current CI scripts to do it.
# --enable-debug: Build RPMs with '--enable-debug' (for testing)
# --dkms: Build DKMS RPMs as well
# --patch-level NUM: Use a custom patch level number for packages.
@@ -27,8 +29,27 @@ POWEROFF=""
RELEASE=""
REPO=""
TARBALL=""
CUSTOM_BRANCH=""
PREV_BRANCH=""
# Switch back to the branch recorded in PREV_BRANCH (set when
# --custom-branch is parsed).  Does nothing when PREV_BRANCH is empty.
cleanup() {
	if [ -n "$PREV_BRANCH" ] ; then
		# Quote the expansion so the ref is always passed to git as
		# exactly one argument.
		git checkout "$PREV_BRANCH"
	fi
}
while [[ $# -gt 0 ]]; do
case $1 in
--custom-branch)
CUSTOM_BRANCH="$2"
# If the user specifies a custom tag/branch to build, and the build
# fails, we want to make sure our workflow scripts are restored to the
# current (more modern) versions so the subsequent CI steps use those.
shift
shift
PREV_BRANCH=$(git branch --show-current)
trap 'cleanup' ERR
;;
--enable-debug)
ENABLE_DEBUG=1
shift
@@ -337,7 +358,7 @@ fi
#
# rhel8.10
# almalinux9.5
# fedora42
# fedora44
source /etc/os-release
if which hostnamectl &> /dev/null ; then
# Fedora 42+ use hostnamectl
@@ -367,6 +388,11 @@ if [ -n "$ENABLE_DEBUG" ] ; then
extra="--enable-debug"
fi
if [ -n "$CUSTOM_BRANCH" ] ; then
git fetch --unshallow
git checkout $CUSTOM_BRANCH
fi
# build
case "$OS" in
freebsd*)
@@ -393,6 +419,8 @@ case "$OS" in
;;
esac
git checkout $PREV_BRANCH
PREV_BRANCH=""
# building the zfs module was ok
echo 0 > /var/tmp/build-exitcode.txt
@@ -25,8 +25,14 @@ cd lustre-release
# Include Lustre patches to build against master/zfs-2.4.x. Once these
# patches are merged we can remove these lines.
#
# LU-19539 osd-zfs: use osd_dmu_write() wrapper for xattrs
# LU-19761 osd-zfs: Build against ZFS 2.4.0
# LU-19249 build: Compatibility updates for kernel v6.16
#
patches=('https://review.whamcloud.com/changes/fs%2Flustre-release~62101/revisions/2/patch?download'
'https://review.whamcloud.com/changes/fs%2Flustre-release~63267/revisions/9/patch?download')
'https://review.whamcloud.com/changes/fs%2Flustre-release~63267/revisions/9/patch?download'
'https://review.whamcloud.com/changes/fs%2Flustre-release~60619/revisions/13/patch?download')
for p in "${patches[@]}" ; do
curl $p | base64 -d > patch
@@ -79,6 +79,7 @@ function do_builtin_build() {
cd $HOME/linux-$fullver
./scripts/config --enable ZFS
./scripts/config --enable ZFS_DEBUG
yes "" | make oldconfig
make -j `nproc`
) &> /var/tmp/builtin.txt || rc=$?
@@ -185,6 +186,13 @@ case "$OS" in
sudo mount -o noatime /dev/vdb /var/tmp
sudo chmod 1777 /var/tmp
sudo mv -f /tmp/*.txt /var/tmp
# Allow for longer RCU timeouts due to the heavily virtualized and
# potentially oversubscribed nature of the CI environment.
rcu_cpu_stall_timeout="/sys/module/rcupdate/parameters/rcu_cpu_stall_timeout"
if test -f $rcu_cpu_stall_timeout; then
echo 120 | sudo sh -c "cat > '$rcu_cpu_stall_timeout'"
fi
;;
esac
+9
View File
@@ -3,6 +3,14 @@ name: smatch
on:
push:
pull_request:
paths-ignore:
- 'man/**'
- '**.md'
- 'AUTHORS'
- 'COPYRIGHT'
- 'LICENSE'
- 'NOTICE'
- '.gitignore'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -10,6 +18,7 @@ concurrency:
jobs:
smatch:
if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs'
runs-on: ubuntu-24.04
steps:
- name: Checkout smatch
+50 -1
View File
@@ -3,11 +3,30 @@ name: zfs-arm
on:
push:
pull_request:
paths-ignore:
- 'man/**'
- '**.md'
- 'AUTHORS'
- 'COPYRIGHT'
- 'LICENSE'
- 'NOTICE'
- '.gitignore'
workflow_dispatch:
inputs:
gcc_ver:
type: string
required: false
default: ""
description: "(optional) install specific GCC version, like '16'"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
zfs-arm:
name: ZFS ARM build
if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs'
runs-on: ubuntu-24.04-arm
steps:
- uses: actions/checkout@v6
@@ -18,6 +37,31 @@ jobs:
timeout-minutes: 20
run: |
sudo apt-get -y remove firefox || true
# Do we want to test with a custom GCC version?
if [ "${{ github.event.inputs.gcc_ver }}" != "" ] ; then
ver="${{ github.event.inputs.gcc_ver }}"
sudo add-apt-repository ppa:ubuntu-toolchain-r/test
sudo apt-get update
echo "GCCs available:"
awk '/Package: gcc-/{print $2}' /var/lib/apt/lists/*ubuntu-toolchain-r*Packages
sudo apt-get -y install gcc g++ gcc-$ver g++-$ver
sudo update-alternatives --remove-all gcc || true 2>&1
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-$ver 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-$ver 100
sudo update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100
sudo update-alternatives --set cc /usr/bin/gcc
sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100
sudo update-alternatives --set c++ /usr/bin/g++
sudo update-alternatives --set gcc "/usr/bin/gcc-$ver"
sudo update-alternatives --set g++ "/usr/bin/g++-$ver"
fi
.github/workflows/scripts/qemu-3-deps-vm.sh ubuntu24
# We're running the VM scripts locally on the runner, so need to fix
@@ -28,7 +72,12 @@ jobs:
- name: Build modules
timeout-minutes: 30
run: |
.github/workflows/scripts/qemu-4-build-vm.sh --enable-debug ubuntu24
# Even though we may have installed a newer GCC, the kernel builds don't
# seem to honor it, and instead use the older GCC. I assume this is
# to match up with whatever GCC version was used for the kernel. Always
# specify KERNEL_CC to get around this. This works when using the
# default GCC and with a custom GCC.
KERNEL_CC=/usr/bin/gcc .github/workflows/scripts/qemu-4-build-vm.sh --enable-debug ubuntu24
# Quick sanity test since we're not running the full ZTS
sudo modprobe zfs
+13 -1
View File
@@ -42,6 +42,11 @@ on:
required: false
default: ""
description: "(optional) repo URL (blank: use http://download.zfsonlinux.org)"
custom_branch:
type: string
required: false
default: ""
description: "(optional) custom tag/branch to build using current CI (like 'zfs-2.2.9')"
lookup:
type: boolean
required: false
@@ -58,7 +63,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: ['almalinux8', 'almalinux9', 'almalinux10', 'fedora42', 'fedora43', 'fedora44']
os: ['almalinux8', 'almalinux9', 'almalinux10', 'fedora43', 'fedora44']
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v6
@@ -94,9 +99,16 @@ jobs:
if [ -n "${{ github.event.inputs.patch_level }}" ] ; then
EXTRA="--patch-level ${{ github.event.inputs.patch_level }}"
fi
if [ -n "${{ github.event.inputs.custom_branch }}" ] ; then
EXTRA+=" --custom-branch ${{ github.event.inputs.custom_branch }}"
fi
.github/workflows/scripts/qemu-4-build.sh $EXTRA \
--repo --release --dkms --tarball ${{ matrix.os }}
if [ -n "${{ github.event.inputs.custom_branch }}" ] ; then
echo "Built packages for ${{ github.event.inputs.custom_branch }}"
fi
fi
- name: Prepare artifacts
+21 -10
View File
@@ -14,7 +14,7 @@ on:
type: string
required: false
default: ""
description: "(optional) Only run on this specific OS (like 'fedora42' or 'alpine3-23')"
description: "(optional) Only run on this specific OS (like 'fedora44' or 'alpine3-23')"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -23,6 +23,7 @@ concurrency:
jobs:
test-config:
name: Setup
if: github.event_name == 'pull_request' || github.repository != 'openzfs/zfs'
runs-on: ubuntu-24.04
outputs:
test_os: ${{ steps.os.outputs.os }}
@@ -45,24 +46,27 @@ jobs:
fi
case "$ci_type" in
docs)
os_selection='[]'
;;
quick)
os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd15-1s", "ubuntu24"]'
os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora44", "freebsd15-1s", "ubuntu26"]'
;;
linux)
os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora42", "fedora43", "fedora44", "ubuntu22", "ubuntu24"]'
os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora43", "fedora44", "ubuntu22", "ubuntu24", "ubuntu26"]'
;;
freebsd)
os_selection='["freebsd13-5r", "freebsd14-4r", "freebsd13-5s", "freebsd14-4s", "freebsd15-1s", "freebsd16-0c"]'
os_selection='["freebsd14-4r", "freebsd14-4s", "freebsd15-0r", "freebsd15-1s", "freebsd16-0c"]'
;;
*)
# default list
os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora42", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24"]'
os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora43", "fedora44", "freebsd14-4r", "freebsd15-0r", "freebsd15-1s", "freebsd16-0c", "ubuntu22", "ubuntu24", "ubuntu26"]'
;;
esac
# Repository-level override for OS selection.
# Set vars.ZTS_OS_OVERRIDE in repo settings to restrict targets
# (e.g. '["debian13"]' or '["debian13", "fedora42"]').
# (e.g. '["debian13"]' or '["debian13", "fedora44"]').
# Manual ZFS-CI-Type in commit messages bypasses the override.
if [ -n "${{ vars.ZTS_OS_OVERRIDE }}" ] && [ "$ci_source" != "manual" ]; then
override='${{ vars.ZTS_OS_OVERRIDE }}'
@@ -91,15 +95,19 @@ jobs:
qemu-vm:
name: qemu-x86
needs: [ test-config ]
if: >-
(github.event_name == 'pull_request' ||
github.repository != 'openzfs/zfs') &&
needs.test-config.outputs.ci_type != 'docs'
strategy:
fail-fast: false
matrix:
# rhl: almalinux8, almalinux9, centos-streamX, fedora4x
# debian: debian12, debian13, ubuntu22, ubuntu24
# debian: debian12, debian13, ubuntu22, ubuntu24, ubuntu26
# misc: archlinux, tumbleweed
# FreeBSD variants as of November 2025:
# FreeBSD Release: freebsd13-5r, freebsd14-4r, freebsd15-0r
# FreeBSD Stable: freebsd13-5s, freebsd14-4s, freebsd15-1s
# FreeBSD Release: freebsd14-4r, freebsd15-0r
# FreeBSD Stable: freebsd14-4s, freebsd15-1s
# FreeBSD Current: freebsd16-0c
os: ${{ fromJson(needs.test-config.outputs.test_os) }}
runs-on: ubuntu-24.04
@@ -153,7 +161,10 @@ jobs:
run: .github/workflows/scripts/qemu-8-summary.sh '${{ steps.artifact-upload.outputs.artifact-url }}'
cleanup:
if: always()
if: >-
(github.event_name == 'pull_request' ||
github.repository != 'openzfs/zfs') &&
always()
name: Cleanup
runs-on: ubuntu-latest
needs: [ qemu-vm ]
+8
View File
@@ -3,6 +3,14 @@ name: zloop
on:
push:
pull_request:
paths-ignore:
- 'man/**'
- '**.md'
- 'AUTHORS'
- 'COPYRIGHT'
- 'LICENSE'
- 'NOTICE'
- '.gitignore'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+1
View File
@@ -138,6 +138,7 @@ cstyle:
! -path './include/sys/lua/*' \
! -path './module/lua/l*.[ch]' \
! -path './module/zfs/lz4.c' \
! -path './tests/unit/munit.[ch]' \
$(cstyle_line)
filter_executable = -exec test -x '{}' \; -print
+2 -2
View File
@@ -52,7 +52,7 @@ All RHEL (and compatible systems: AlmaLinux OS, Rocky Linux, etc) on the **full*
All Ubuntu **LTS** releases are supported.
**Supported Ubuntu releases**: **24.04 “Noble”**, **22.04 “Jammy”**.
**Supported Ubuntu releases**: **26.04 “Resolute”**, **24.04 “Noble”**, **22.04 “Jammy”**.
### Debian
@@ -68,4 +68,4 @@ Generally, if a distribution is following an LTS kernel, it should work well wit
All FreeBSD releases receiving [security support](https://www.freebsd.org/security/#sup) are supported by OpenZFS.
**Supported FreeBSD releases**: **15.0**, **14.4**, **13.5**.
**Supported FreeBSD releases**: **15.0**, **14.4**.
-1
View File
@@ -54,7 +54,6 @@ ztest_LDADD = \
libnvpair.la
ztest_LDADD += -lm
ztest_LDFLAGS = -pthread
include $(srcdir)/%D%/raidz_test/Makefile.am
+3 -3
View File
@@ -565,10 +565,10 @@ def init():
update_hdr_intr()
# check if L2ARC exists
# check if L2ARC exists; fall back to l2_size for older kernels that
# do not export l2_ndev
snap_stats()
l2_size = cur.get("l2_size")
if l2_size:
if cur.get("l2_ndev") or cur.get("l2_size"):
l2exist = True
if desired_cols:
+4 -1
View File
@@ -856,7 +856,10 @@ def section_l2arc(kstats_dict):
# The L2ARC statistics live in the same section as the normal ARC stuff
arc_stats = isolate_section('arcstats', kstats_dict)
if arc_stats['l2_size'] == '0':
# Skip the section only when no cache device is attached. Fall back to
# l2_size for older kernels that do not export l2_ndev.
if arc_stats.get('l2_ndev', '0') == '0' and \
arc_stats['l2_size'] == '0':
print('L2ARC not detected, skipping section\n')
return
+153 -58
View File
@@ -2802,18 +2802,18 @@ print_file_layout_raidz(vdev_t *vd, blkptr_t *bp, uint64_t file_offset,
vd->vdev_children, vdrz->vd_nparity);
raidz_row_t *rr = rm->rm_row[0];
if (!dump_opt['H']) {
int last_disk = vd->vdev_children - 1;
/*
* Account for out of order disks in raidz1.
* For now just reverse them back and adjust for it later.
*/
if (rr->rr_firstdatacol == 1 && (zio.io_offset & (1ULL << 20))) {
if (rr->rr_firstdatacol == 1 &&
(zio.io_offset & (1ULL << 20))) {
uint64_t devidx = rr->rr_col[0].rc_devidx;
rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
rr->rr_col[1].rc_devidx = devidx;
}
if (!dump_opt['H']) {
int last_disk = vd->vdev_children - 1;
int first_disk = rr->rr_col[0].rc_devidx;
(void) printf("%12llx", (u_longlong_t)file_offset);
@@ -2843,23 +2843,49 @@ print_file_layout_raidz(vdev_t *vd, blkptr_t *bp, uint64_t file_offset,
static uint64_t next_offset = 0;
if (next_offset != file_offset) {
(void) printf("skip hole\t-\t%llx\n",
(u_longlong_t)((file_offset - next_offset) >>
vd->vdev_ashift));
(void) printf("skip hole\t-\t\t%lld\n",
(u_longlong_t)((file_offset - next_offset) / 512));
}
next_offset = file_offset + BP_GET_LSIZE(bp);
uint64_t tmp_offset = file_offset;
for (int c = 0; c < rr->rr_cols; c++) {
boolean_t pcol = c < rr->rr_firstdatacol;
raidz_col_t *rc = &rr->rr_col[c];
char *path = vd->vdev_child[rc->rc_devidx]->vdev_path;
// c < rr->rr_firstdatacol
if (rc->rc_size == 0)
continue;
(void) printf("%s\t%llu\t%d\n",
(void) printf("%s\t\t%llu\t%d",
zfs_basename(path),
(u_longlong_t)(rc->rc_offset +
VDEV_LABEL_START_SIZE)/512,
(int)rc->rc_size/512);
if (dump_opt['v']) {
char label = pcol ? 'P' : 'D';
int num;
if (c < 2) {
num = 0;
} else {
num = pcol ? c :
(c - rr->rr_firstdatacol);
}
printf("\t%c%d", label, num);
if (dump_opt['v'] > 1) {
unsigned long long off;
if (pcol)
off = file_offset;
else
off = tmp_offset;
off = off / 512ULL;
printf("\t%llu", off);
}
}
if (!pcol)
tmp_offset += rc->rc_size;
printf("\n");
}
}
}
@@ -2989,7 +3015,12 @@ dump_indirect_layout(dnode_t *dn)
* Start layout with a header
*/
if (dump_opt['H']) {
(void) printf("DISK\t\tLBA\t\tCOUNT\n");
(void) printf("DISK\t\t\tLBA\tCOUNT");
if (dump_opt['v'])
(void) printf("\tTYPE");
if (dump_opt['v'] > 1)
(void) printf("\tOFFSET");
printf("\n");
} else {
char diskhdr[16];
@@ -6325,22 +6356,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type)
{
int i;
boolean_t claimed = B_FALSE;
boolean_t ddt_block = B_FALSE;
boolean_t brt_block = B_FALSE;
ASSERT(type < ZDB_OT_TOTAL);
if (zilog && zil_bp_tree_add(zilog, bp) != 0)
return;
/*
* This flag controls if we will issue a claim for the block while
* counting it, to ensure that all blocks are referenced in space maps.
* We don't issue claims if we're not doing leak tracking, because it's
* expensive if the user isn't interested. We also don't claim the
* second or later occurrences of cloned or dedup'd blocks, because we
* already claimed them the first time.
*/
boolean_t do_claim = !dump_opt['L'];
spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
blkptr_t tempbp;
@@ -6371,21 +6395,30 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
/*
* ddt_lookup() can return NULL if this block didn't exist
* in the DDT and creating it would take the DDT over its
* quota. Since we got the block from disk, it must exist in
* the DDT, so this can't happen. However, when unique entries
* are pruned, the dedup bit can be set with no corresponding
* entry in the DDT.
* ddt_lookup() can return NULL when unique entries are pruned
* from the DDT.
*/
if (dde == NULL) {
ddt_exit(ddt);
goto skipped;
goto ddt_done;
}
/* Get the phys for this variant */
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
/*
* DDT_PHYS_NONE means the block has the dedup bit set but
* its DVA doesn't match any phys in the entry. This can
* happen when a DVA was evicted from the DDT and re-added
* on a hash collision. The block may still have a BRT entry.
*/
if (v == DDT_PHYS_NONE) {
ddt_exit(ddt);
goto ddt_done;
}
ddt_block = B_TRUE;
/*
* This entry may have multiple sets of DVAs. We must claim
* each set the first time we see them in a real block on disk,
@@ -6400,8 +6433,14 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
dde->dde_io =
(void *)(((uintptr_t)dde->dde_io) | (1 << v));
/* Consume a reference for this block. */
if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
/*
* Consume a reference. If this variant's refcount is already
* zero, the DDT tracking is exhausted: more filesystem
* references exist than the DDT accounts for.
*/
boolean_t ddt_refcnt_exhausted =
(ddt_phys_refcnt(dde->dde_phys, v) == 0);
if (!ddt_refcnt_exhausted)
ddt_phys_decref(dde->dde_phys, v);
/*
@@ -6430,20 +6469,21 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
bp = &tempbp;
}
if (seen) {
if (seen && !ddt_refcnt_exhausted) {
/*
* The second or later time we see this block,
* it's a duplicate and we count it.
*/
zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
zcb->zcb_dedup_blocks++;
/* Already claimed, don't do it again. */
do_claim = B_FALSE;
claimed = B_TRUE;
}
ddt_exit(ddt);
} else if (zcb->zcb_brt_is_active &&
}
ddt_done:
if (!claimed && zcb->zcb_brt_is_active &&
brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
@@ -6451,10 +6491,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if it's there, we note it and its refcount then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
* we haven't seen before, we look it up in the real BRT. If
* we see the block again, we count it as a clone.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
@@ -6462,10 +6500,10 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre == NULL) {
/* Not seen before; track it */
uint64_t refcnt =
brt_entry_get_refcount(zcb->zcb_spa, bp);
if (refcnt > 0) {
brt_block = B_TRUE;
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
@@ -6473,25 +6511,16 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
avl_insert(&zcb->zcb_brt, zbre, where);
}
} else {
/*
* Second or later occurrence, count it and take a
* refcount.
*/
brt_block = B_TRUE;
if (zbre->zbre_refcount > 0) {
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
claimed = B_TRUE;
}
/* Already claimed, don't do it again. */
do_claim = B_FALSE;
}
}
skipped:
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
int t = (i & 1) ? type : ZDB_OT_TOTAL;
@@ -6650,12 +6679,21 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
#undef BIN
hist_skipped:
if (!do_claim)
if (claimed || dump_opt['L'])
return;
VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
int claim_err = zio_wait(zio_claim(NULL, zcb->zcb_spa,
spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
ZIO_FLAG_CANFAIL)));
ZIO_FLAG_CANFAIL));
if (claim_err != 0) {
char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("block claim error %d%s%s: %s\n",
claim_err, brt_block ? " (BRT)" : "",
ddt_block ? " (DDT)" : "", blkbuf);
zcb->zcb_haderrors = 1;
zcb->zcb_errors[claim_err]++;
}
}
static void
@@ -7431,10 +7469,66 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
if (dump_opt['L'])
return (B_FALSE);
boolean_t leaks = B_FALSE;
/*
* Report leaked BRT entries whose refcount was not fully consumed by
* the traversal.
*/
if (zcb->zcb_brt_is_active) {
void *cookie = NULL;
zdb_brt_entry_t *zbre;
while ((zbre = avl_destroy_nodes(
&zcb->zcb_brt, &cookie)) != NULL) {
if (!dump_opt['L'] && zbre->zbre_refcount != 0) {
(void) printf("BRT leak: vdev %llu, "
"offset 0x%llx, refcount %llu\n",
(u_longlong_t)DVA_GET_VDEV(
&zbre->zbre_dva),
(u_longlong_t)DVA_GET_OFFSET(
&zbre->zbre_dva),
(u_longlong_t)zbre->zbre_refcount);
leaks = B_TRUE;
}
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
avl_destroy(&zcb->zcb_brt);
}
if (dump_opt['L'])
return (leaks);
/*
* Report leaked DDT entries whose refcount was not fully consumed by
* the traversal. Entries in the DDT ZAP that were never looked up
* are not detected here.
*/
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (ddt == NULL)
continue;
ddt_enter(ddt);
for (ddt_entry_t *dde = avl_first(&ddt->ddt_tree); dde != NULL;
dde = AVL_NEXT(&ddt->ddt_tree, dde)) {
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
uint64_t refcnt = ddt_phys_refcnt(dde->dde_phys,
v);
if (refcnt == 0)
continue;
blkptr_t blk;
char blkbuf[BP_SPRINTF_LEN];
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
dde->dde_phys, v, &blk);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
(void) printf("DDT leak: refcount %llu %s\n",
(u_longlong_t)refcnt, blkbuf);
leaks = B_TRUE;
}
}
ddt_exit(ddt);
}
vdev_t *rvd = spa->spa_root_vdev;
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
@@ -10136,7 +10230,7 @@ main(int argc, char **argv)
* Automate cachefile
*/
if (!spa_config_path_env && !config_path_console && target &&
libzfs_core_init() == 0) {
!dump_opt['l'] && libzfs_core_init() == 0) {
char *pname = strdup(target);
const char *value;
nvlist_t *pnvl = NULL;
@@ -10519,6 +10613,7 @@ main(int argc, char **argv)
}
if (dump_opt['f'] && os != NULL) {
dump_opt['v'] = verbose;
dump_file_data_layout(os);
} else if (dump_opt['B']) {
dump_backup(target, objset_id,
-1
View File
@@ -41,6 +41,5 @@ zed_LDADD = \
libnvpair.la
zed_LDADD += -lrt $(LIBATOMIC_LIBS) $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
zed_LDFLAGS = -pthread
dist_noinst_DATA += %D%/agents/README.md
@@ -350,6 +350,60 @@ is_draid_fdomain_failure(fmd_hdl_t *hdl, libzfs_handle_t *zhdl,
return (res);
}
/*
 * Classify a spare relative to the failed vdev's top-level guid.
 *
 * Returns 1 when the spare carries no (or a zero) ZPOOL_CONFIG_TOP_GUID,
 * 2 when its ZPOOL_CONFIG_TOP_GUID equals 'top_guid' or 'top_guid' is 0,
 * and 0 otherwise.  Higher values are tried earlier.
 */
static int
spare_pref_rank(nvlist_t *spare, uint64_t top_guid)
{
	uint64_t spare_top = 0;

	(void) nvlist_lookup_uint64(spare, ZPOOL_CONFIG_TOP_GUID, &spare_top);

	if (spare_top == 0)
		return (1);
	if (spare_top == top_guid || top_guid == 0)
		return (2);
	return (0);
}

/*
 * Return vs_rsize from the spare's ZPOOL_CONFIG_VDEV_STATS array, or 0
 * when the spare has no stats attached.
 */
static uint64_t
spare_pref_rsize(nvlist_t *spare)
{
	vdev_stat_t *stats;
	unsigned int nstats;

	if (nvlist_lookup_uint64_array(spare, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&stats, &nstats) != 0)
		return (0);

	return (stats->vs_rsize);
}

/*
 * Decide whether spare 'a' should be attempted ahead of spare 'b' as the
 * replacement for a failed vdev with the given characteristics.  Returns
 * B_TRUE when 'a' goes first.
 *
 * The comparison falls through these criteria, most significant first:
 *   1. Kind of spare.  A distributed spare belonging to the failed vdev's
 *      dRAID wins (distributed spares rebuild faster than traditional
 *      spares), regular spares (no TOP_GUID) follow, and distributed
 *      spares of some other dRAID go last, as the kernel will reject them
 *      anyway.
 *   2. A spare whose rotational property matches the failed vdev beats
 *      one that does not (only when 'have_rotational' is set).
 *   3. A spare that is large enough beats one that is too small.
 *   4. Among the rest, the smaller spare wins (best fit).
 */
static boolean_t
spare_is_preferred(nvlist_t *a, nvlist_t *b, boolean_t have_rotational,
    uint64_t vdev_rotational, uint64_t vdev_size, uint64_t top_guid)
{
	/* 1. Kind of spare. */
	int rank_a = spare_pref_rank(a, top_guid);
	int rank_b = spare_pref_rank(b, top_guid);

	if (rank_a != rank_b)
		return (rank_a > rank_b);

	/* 2. Rotational match; a missing property is treated as 0. */
	if (have_rotational) {
		uint64_t rot_a = 0, rot_b = 0;

		(void) nvlist_lookup_uint64(a, ZPOOL_CONFIG_VDEV_ROTATIONAL,
		    &rot_a);
		(void) nvlist_lookup_uint64(b, ZPOOL_CONFIG_VDEV_ROTATIONAL,
		    &rot_b);

		boolean_t match_a = (rot_a == vdev_rotational);
		boolean_t match_b = (rot_b == vdev_rotational);

		if (match_a != match_b)
			return (match_a);
	}

	/* 3. Big enough beats too small. */
	uint64_t size_a = spare_pref_rsize(a);
	uint64_t size_b = spare_pref_rsize(b);
	boolean_t fits_a = (size_a >= vdev_size);
	boolean_t fits_b = (size_b >= vdev_size);

	if (fits_a != fits_b)
		return (fits_a);

	/* 4. Best fit: the smaller of the two goes first. */
	return (size_a < size_b);
}
/*
* Given a vdev, attempt to replace it with every known spare until one
* succeeds or we run out of devices to try.
@@ -364,6 +418,10 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
char *dev_name;
zprop_source_t source;
int ashift;
uint64_t vdev_rotational = 0, vdev_size = 0, top_guid = 0;
boolean_t have_vdev_rotational;
vdev_stat_t *vs;
unsigned int c;
config = zpool_get_config(zhp, NULL);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
@@ -377,6 +435,35 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
&spares, &nspares) != 0)
return (B_FALSE);
/*
* Collect the failed vdev's parameters for optimal replacement.
*/
have_vdev_rotational = (nvlist_lookup_uint64(vdev,
ZPOOL_CONFIG_VDEV_ROTATIONAL, &vdev_rotational) == 0);
if (nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0)
vdev_size = vs->vs_rsize;
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_TOP_GUID, &top_guid);
/*
* Build a sorted index array over the spares, so that better
* candidates are tried first.
*/
uint_t order[nspares];
for (s = 0; s < nspares; s++)
order[s] = s;
for (s = 1; s < nspares; s++) {
uint_t key = order[s];
int j = (int)s - 1;
while (j >= 0 && spare_is_preferred(spares[key],
spares[order[j]], have_vdev_rotational, vdev_rotational,
vdev_size, top_guid)) {
order[j + 1] = order[j];
j--;
}
order[j + 1] = key;
}
/*
* lookup "ashift" pool property, we may need it for the replacement
*/
@@ -394,25 +481,26 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
* replace it.
*/
for (s = 0; s < nspares; s++) {
nvlist_t *spare = spares[order[s]];
boolean_t rebuild = B_FALSE;
const char *spare_name, *type;
if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
if (nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH,
&spare_name) != 0)
continue;
/* prefer sequential resilvering for distributed spares */
if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
if ((nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE,
&type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
rebuild = B_TRUE;
/* if set, add the "ashift" pool property to the spare nvlist */
if (source != ZPROP_SRC_DEFAULT)
(void) nvlist_add_uint64(spares[s],
(void) nvlist_add_uint64(spare,
ZPOOL_CONFIG_ASHIFT, ashift);
(void) nvlist_add_nvlist_array(replacement,
ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spares[s], 1);
ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spare, 1);
fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
dev_name, zfs_basename(spare_name));
+12
View File
@@ -9399,6 +9399,18 @@ main(int argc, char **argv)
return (1);
}
/*
* Special case '<subcommand> --help|-?'
*/
if (argc >= 3 && (strcmp(argv[2], "--help") == 0 ||
strcmp(argv[2], "-?") == 0)) {
int idx;
if (find_command_idx(cmdname, &idx) == 0) {
current_command = &command_table[idx];
usage(B_FALSE);
}
}
zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
libzfs_print_on_error(g_zfs, B_TRUE);
@@ -13878,6 +13878,18 @@ main(int argc, char **argv)
if (strcmp(cmdname, "help") == 0)
return (zpool_do_help(argc, argv));
/*
* Special case '<subcommand> --help|-?'
*/
if (argc >= 3 && (strcmp(argv[2], "--help") == 0 ||
strcmp(argv[2], "-?") == 0)) {
int idx;
if (find_command_idx(cmdname, &idx) == 0) {
current_command = &command_table[idx];
usage(B_FALSE);
}
}
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
return (1);
+36
View File
@@ -29,6 +29,8 @@
#include <libintl.h>
#include <stddef.h>
#include <libzfs.h>
#include <signal.h>
#include <sys/backtrace.h>
#include "zstream.h"
void
@@ -53,9 +55,43 @@ zstream_usage(void)
exit(1);
}
static void sig_handler(int signo)
{
struct sigaction action;
libspl_backtrace(STDERR_FILENO);
/*
* Restore default action and re-raise signal so SIGSEGV and
* SIGABRT can trigger a core dump.
*/
action.sa_handler = SIG_DFL;
sigemptyset(&action.sa_mask);
action.sa_flags = 0;
(void) sigaction(signo, &action, NULL);
raise(signo);
}
int
main(int argc, char *argv[])
{
/*
* Set up signal handlers, so if we crash due to bad data in the stream
* we can get more info. Unlike ztest, we don't bail out if we can't
* set up signal handlers, because zstream is very useful without them.
*/
struct sigaction action = { .sa_handler = sig_handler };
sigemptyset(&action.sa_mask);
action.sa_flags = 0;
if (sigaction(SIGSEGV, &action, NULL) < 0) {
(void) fprintf(stderr, "zstream: cannot catch SIGSEGV: %s\n",
strerror(errno));
}
if (sigaction(SIGABRT, &action, NULL) < 0) {
(void) fprintf(stderr, "zstream: cannot catch SIGABRT: %s\n",
strerror(errno));
}
char *basename = strrchr(argv[0], '/');
basename = basename ? (basename + 1) : argv[0];
if (argc >= 1 && strcmp(basename, "zstreamdump") == 0)
@@ -385,6 +385,20 @@ zstream_do_dump(int argc, char *argv[])
(void) ssread(buf, sz, &zc);
if (ferror(send_stream))
perror("fread");
uint8_t *nv_header = (uint8_t *)buf;
boolean_t xdr = nv_header[0] == NV_ENCODE_XDR;
boolean_t big_endian = nv_header[1] == 0;
const char *nc;
if (xdr) {
nc = "NV_ENCODE_XDR";
} else if (big_endian) {
nc = "NV_ENCODE_NATIVE (big-endian)";
} else {
nc = "NV_ENCODE_NATIVE (little-endian)";
}
printf("nvlist encoding = %s\n", nc);
err = nvlist_unpack(buf, sz, &nv, 0);
if (err) {
perror(strerror(err));
@@ -99,6 +99,7 @@ zstream_do_recompress(int argc, char *argv[])
exit(1);
}
zfs_refcount_init();
abd_init();
fletcher_4_init();
zio_init();
@@ -353,6 +354,7 @@ zstream_do_recompress(int argc, char *argv[])
zio_fini();
zstd_fini();
abd_fini();
zfs_refcount_fini();
return (0);
}
+2
View File
@@ -23,6 +23,7 @@ AM_CFLAGS += $(IMPLICIT_FALLTHROUGH)
AM_CFLAGS += $(DEBUG_CFLAGS)
AM_CFLAGS += $(ASAN_CFLAGS)
AM_CFLAGS += $(UBSAN_CFLAGS)
AM_CFLAGS += $(PTHREAD_CFLAGS)
AM_CFLAGS += $(CODE_COVERAGE_CFLAGS)
AM_CFLAGS += $(NO_FORMAT_ZERO_LENGTH)
AM_CFLAGS += $(NO_FORMAT_TRUNCATION)
@@ -57,6 +58,7 @@ endif
AM_LDFLAGS = $(DEBUG_LDFLAGS)
AM_LDFLAGS += $(ASAN_LDFLAGS)
AM_LDFLAGS += $(UBSAN_LDFLAGS)
AM_LDFLAGS += $(PTHREAD_LIBS)
if BUILD_FREEBSD
AM_LDFLAGS += -fstack-protector-strong
+523
View File
@@ -0,0 +1,523 @@
# SPDX-License-Identifier: GPL-3.0-or-later WITH Autoconf-exception-macro
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_pthread.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
#
# DESCRIPTION
#
# This macro figures out how to build C programs using POSIX threads. It
# sets the PTHREAD_LIBS output variable to the threads library and linker
# flags, and the PTHREAD_CFLAGS output variable to any special C compiler
# flags that are needed. (The user can also force certain compiler
# flags/libs to be tested by setting these environment variables.)
#
# Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is
# needed for multi-threaded programs (defaults to the value of CC
# respectively CXX otherwise). (This is necessary on e.g. AIX to use the
# special cc_r/CC_r compiler alias.)
#
# NOTE: You are assumed to not only compile your program with these flags,
# but also to link with them as well. For example, you might link with
# $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
# $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
#
# If you are only building threaded programs, you may wish to use these
# variables in your default LIBS, CFLAGS, and CC:
#
# LIBS="$PTHREAD_LIBS $LIBS"
# CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
# CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS"
# CC="$PTHREAD_CC"
# CXX="$PTHREAD_CXX"
#
# In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant
# has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to
# that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
#
# Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the
# PTHREAD_PRIO_INHERIT symbol is defined when compiling with
# PTHREAD_CFLAGS.
#
# ACTION-IF-FOUND is a list of shell commands to run if a threads library
# is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it
# is not found. If ACTION-IF-FOUND is not specified, the default action
# will define HAVE_PTHREAD.
#
# Please let the authors know if this macro fails on any platform, or if
# you have any other suggestions or comments. This macro was based on work
# by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help
# from M. Frigo), as well as ac_pthread and hb_pthread macros posted by
# Alejandro Forero Cuervo to the autoconf macro repository. We are also
# grateful for the helpful feedback of numerous users.
#
# Updated for Autoconf 2.68 by Daniel Richard G.
#
# LICENSE
#
# Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
# Copyright (c) 2011 Daniel Richard G. <skunk@iSKUNK.ORG>
# Copyright (c) 2019 Marc Stevens <marc.stevens@cwi.nl>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
# As a special exception, the respective Autoconf Macro's copyright owner
# gives unlimited permission to copy, distribute and modify the configure
# scripts that are the output of Autoconf when processing the Macro. You
# need not follow the terms of the GNU General Public License when using
# or distributing such scripts, even though portions of the text of the
# Macro appear in them. The GNU General Public License (GPL) does govern
# all other use of the material that constitutes the Autoconf Macro.
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Archive. When you make and distribute a
# modified version of the Autoconf Macro, you may extend this special
# exception to the GPL to apply to your modified version as well.
#serial 31
AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
AC_DEFUN([AX_PTHREAD], [
AC_REQUIRE([AC_CANONICAL_HOST])
AC_REQUIRE([AC_PROG_CC])
AC_REQUIRE([AC_PROG_SED])
AC_LANG_PUSH([C])
ax_pthread_ok=no
# We used to check for pthread.h first, but this fails if pthread.h
# requires special compiler flags (e.g. on Tru64 or Sequent).
# It gets checked for in the link test anyway.
# First of all, check if the user has set any of the PTHREAD_LIBS,
# etcetera environment variables, and if threads linking works using
# them:
if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then
ax_pthread_save_CC="$CC"
ax_pthread_save_CFLAGS="$CFLAGS"
ax_pthread_save_LIBS="$LIBS"
AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"])
AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"])
CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
LIBS="$PTHREAD_LIBS $LIBS"
AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS])
AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes])
AC_MSG_RESULT([$ax_pthread_ok])
if test "x$ax_pthread_ok" = "xno"; then
PTHREAD_LIBS=""
PTHREAD_CFLAGS=""
fi
CC="$ax_pthread_save_CC"
CFLAGS="$ax_pthread_save_CFLAGS"
LIBS="$ax_pthread_save_LIBS"
fi
# We must check for the threads library under a number of different
# names; the ordering is very important because some systems
# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
# libraries is broken (non-POSIX).
# Create a list of thread flags to try. Items with a "," contain both
# C compiler flags (before ",") and linker flags (after ","). Other items
# starting with a "-" are C compiler flags, and remaining items are
# library names, except for "none" which indicates that we try without
# any flags at all, and "pthread-config" which is a program returning
# the flags for the Pth emulation library.
ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
# The ordering *is* (sometimes) important. Some notes on the
# individual items follow:
# pthreads: AIX (must check this before -lpthread)
# none: in case threads are in libc; should be tried before -Kthread and
# other compiler flags to prevent continual compiler warnings
# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64
# (Note: HP C rejects this with "bad form for `-t' option")
# -pthreads: Solaris/gcc (Note: HP C also rejects)
# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
# doesn't hurt to check since this sometimes defines pthreads and
# -D_REENTRANT too), HP C (must be checked before -lpthread, which
# is present but should not be used directly; and before -mthreads,
# because the compiler interprets this as "-mt" + "-hreads")
# -mthreads: Mingw32/gcc, Lynx/gcc
# pthread: Linux, etcetera
# --thread-safe: KAI C++
# pthread-config: use pthread-config program (for GNU Pth library)
case $host_os in
freebsd*)
# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
ax_pthread_flags="-kthread lthread $ax_pthread_flags"
;;
hpux*)
# From the cc(1) man page: "[-mt] Sets various -D flags to enable
# multi-threading and also sets -lpthread."
ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags"
;;
openedition*)
# IBM z/OS requires a feature-test macro to be defined in order to
# enable POSIX threads at all, so give the user a hint if this is
# not set. (We don't define these ourselves, as they can affect
# other portions of the system API in unpredictable ways.)
AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING],
[
# if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS)
AX_PTHREAD_ZOS_MISSING
# endif
],
[AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])])
;;
solaris*)
# On Solaris (at least, for some versions), libc contains stubbed
# (non-functional) versions of the pthreads routines, so link-based
# tests will erroneously succeed. (N.B.: The stubs are missing
# pthread_cleanup_push, or rather a function called by this macro,
# so we could check for that, but who knows whether they'll stub
# that too in a future libc.) So we'll check first for the
# standard Solaris way of linking pthreads (-mt -lpthread).
ax_pthread_flags="-mt,-lpthread pthread $ax_pthread_flags"
;;
esac
# Are we compiling with Clang?
AC_CACHE_CHECK([whether $CC is Clang],
[ax_cv_PTHREAD_CLANG],
[ax_cv_PTHREAD_CLANG=no
# Note that Autoconf sets GCC=yes for Clang as well as GCC
if test "x$GCC" = "xyes"; then
AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG],
[/* Note: Clang 2.7 lacks __clang_[a-z]+__ */
# if defined(__clang__) && defined(__llvm__)
AX_PTHREAD_CC_IS_CLANG
# endif
],
[ax_cv_PTHREAD_CLANG=yes])
fi
])
ax_pthread_clang="$ax_cv_PTHREAD_CLANG"
# GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC)
# Note that for GCC and Clang -pthread generally implies -lpthread,
# except when -nostdlib is passed.
# This is problematic using libtool to build C++ shared libraries with pthread:
# [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=25460
# [2] https://bugzilla.redhat.com/show_bug.cgi?id=661333
# [3] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=468555
# To solve this, first try -pthread together with -lpthread for GCC
AS_IF([test "x$GCC" = "xyes"],
[ax_pthread_flags="-pthread,-lpthread -pthread -pthreads $ax_pthread_flags"])
# Clang takes -pthread (never supported any other flag), but we'll try with -lpthread first
AS_IF([test "x$ax_pthread_clang" = "xyes"],
[ax_pthread_flags="-pthread,-lpthread -pthread"])
# The presence of a feature test macro requesting re-entrant function
# definitions is, on some systems, a strong hint that pthreads support is
# correctly enabled
case $host_os in
darwin* | hpux* | linux* | osf* | solaris*)
ax_pthread_check_macro="_REENTRANT"
;;
aix*)
ax_pthread_check_macro="_THREAD_SAFE"
;;
*)
ax_pthread_check_macro="--"
;;
esac
AS_IF([test "x$ax_pthread_check_macro" = "x--"],
[ax_pthread_check_cond=0],
[ax_pthread_check_cond="!defined($ax_pthread_check_macro)"])
if test "x$ax_pthread_ok" = "xno"; then
for ax_pthread_try_flag in $ax_pthread_flags; do
case $ax_pthread_try_flag in
none)
AC_MSG_CHECKING([whether pthreads work without any flags])
;;
*,*)
PTHREAD_CFLAGS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\1/"`
PTHREAD_LIBS=`echo $ax_pthread_try_flag | sed "s/^\(.*\),\(.*\)$/\2/"`
AC_MSG_CHECKING([whether pthreads work with "$PTHREAD_CFLAGS" and "$PTHREAD_LIBS"])
;;
-*)
AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag])
PTHREAD_CFLAGS="$ax_pthread_try_flag"
;;
pthread-config)
AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no])
AS_IF([test "x$ax_pthread_config" = "xno"], [continue])
PTHREAD_CFLAGS="`pthread-config --cflags`"
PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
;;
*)
AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag])
PTHREAD_LIBS="-l$ax_pthread_try_flag"
;;
esac
ax_pthread_save_CFLAGS="$CFLAGS"
ax_pthread_save_LIBS="$LIBS"
CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
LIBS="$PTHREAD_LIBS $LIBS"
# Check for various functions. We must include pthread.h,
# since some functions may be macros. (On the Sequent, we
# need a special flag -Kthread to make this header compile.)
# We check for pthread_join because it is in -lpthread on IRIX
# while pthread_create is in libc. We check for pthread_attr_init
# due to DEC craziness with -lpthreads. We check for
# pthread_cleanup_push because it is one of the few pthread
# functions on Solaris that doesn't have a non-functional libc stub.
# We try pthread_create on general principles.
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>
# if $ax_pthread_check_cond
# error "$ax_pthread_check_macro must be defined"
# endif
static void *some_global = NULL;
static void routine(void *a)
{
/* To avoid any unused-parameter or
unused-but-set-parameter warning. */
some_global = a;
}
static void *start_routine(void *a) { return a; }],
[pthread_t th; pthread_attr_t attr;
pthread_create(&th, 0, start_routine, 0);
pthread_join(th, 0);
pthread_attr_init(&attr);
pthread_cleanup_push(routine, 0);
pthread_cleanup_pop(0) /* ; */])],
[ax_pthread_ok=yes],
[])
CFLAGS="$ax_pthread_save_CFLAGS"
LIBS="$ax_pthread_save_LIBS"
AC_MSG_RESULT([$ax_pthread_ok])
AS_IF([test "x$ax_pthread_ok" = "xyes"], [break])
PTHREAD_LIBS=""
PTHREAD_CFLAGS=""
done
fi
# Clang needs special handling, because older versions handle the -pthread
# option in a rather... idiosyncratic way
if test "x$ax_pthread_clang" = "xyes"; then
# Clang takes -pthread; it has never supported any other flag
# (Note 1: This will need to be revisited if a system that Clang
# supports has POSIX threads in a separate library. This tends not
# to be the way of modern systems, but it's conceivable.)
# (Note 2: On some systems, notably Darwin, -pthread is not needed
# to get POSIX threads support; the API is always present and
# active. We could reasonably leave PTHREAD_CFLAGS empty. But
# -pthread does define _REENTRANT, and while the Darwin headers
# ignore this macro, third-party headers might not.)
# However, older versions of Clang make a point of warning the user
# that, in an invocation where only linking and no compilation is
# taking place, the -pthread option has no effect ("argument unused
# during compilation"). They expect -pthread to be passed in only
# when source code is being compiled.
#
# Problem is, this is at odds with the way Automake and most other
# C build frameworks function, which is that the same flags used in
# compilation (CFLAGS) are also used in linking. Many systems
# supported by AX_PTHREAD require exactly this for POSIX threads
# support, and in fact it is often not straightforward to specify a
# flag that is used only in the compilation phase and not in
# linking. Such a scenario is extremely rare in practice.
#
# Even though use of the -pthread flag in linking would only print
# a warning, this can be a nuisance for well-run software projects
# that build with -Werror. So if the active version of Clang has
# this misfeature, we search for an option to squash it.
AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread],
[ax_cv_PTHREAD_CLANG_NO_WARN_FLAG],
[ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown
# Create an alternate version of $ac_link that compiles and
# links in two steps (.c -> .o, .o -> exe) instead of one
# (.c -> exe), because the warning occurs only in the second
# step
ax_pthread_save_ac_link="$ac_link"
ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g'
ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"`
ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)"
ax_pthread_save_CFLAGS="$CFLAGS"
for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do
AS_IF([test "x$ax_pthread_try" = "xunknown"], [break])
CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS"
ac_link="$ax_pthread_save_ac_link"
AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])],
[ac_link="$ax_pthread_2step_ac_link"
AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])],
[break])
])
done
ac_link="$ax_pthread_save_ac_link"
CFLAGS="$ax_pthread_save_CFLAGS"
AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no])
ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try"
])
case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in
no | unknown) ;;
*) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;;
esac
fi # $ax_pthread_clang = yes
# Various other checks:
if test "x$ax_pthread_ok" = "xyes"; then
ax_pthread_save_CFLAGS="$CFLAGS"
ax_pthread_save_LIBS="$LIBS"
CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
LIBS="$PTHREAD_LIBS $LIBS"
# Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
AC_CACHE_CHECK([for joinable pthread attribute],
[ax_cv_PTHREAD_JOINABLE_ATTR],
[ax_cv_PTHREAD_JOINABLE_ATTR=unknown
for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>],
[int attr = $ax_pthread_attr; return attr /* ; */])],
[ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break],
[])
done
])
AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \
test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \
test "x$ax_pthread_joinable_attr_defined" != "xyes"],
[AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE],
[$ax_cv_PTHREAD_JOINABLE_ATTR],
[Define to necessary symbol if this constant
uses a non-standard name on your system.])
ax_pthread_joinable_attr_defined=yes
])
AC_CACHE_CHECK([whether more special flags are required for pthreads],
[ax_cv_PTHREAD_SPECIAL_FLAGS],
[ax_cv_PTHREAD_SPECIAL_FLAGS=no
case $host_os in
solaris*)
ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS"
;;
esac
])
AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \
test "x$ax_pthread_special_flags_added" != "xyes"],
[PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS"
ax_pthread_special_flags_added=yes])
AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT],
[ax_cv_PTHREAD_PRIO_INHERIT],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]],
[[int i = PTHREAD_PRIO_INHERIT;
return i;]])],
[ax_cv_PTHREAD_PRIO_INHERIT=yes],
[ax_cv_PTHREAD_PRIO_INHERIT=no])
])
AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \
test "x$ax_pthread_prio_inherit_defined" != "xyes"],
[AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])
ax_pthread_prio_inherit_defined=yes
])
CFLAGS="$ax_pthread_save_CFLAGS"
LIBS="$ax_pthread_save_LIBS"
# More AIX lossage: compile with *_r variant
if test "x$GCC" != "xyes"; then
case $host_os in
aix*)
AS_CASE(["x/$CC"],
[x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6],
[#handle absolute path differently from PATH based program lookup
AS_CASE(["x$CC"],
[x/*],
[
AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])
AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])])
],
[
AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])
AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])])
]
)
])
;;
esac
fi
fi
test -n "$PTHREAD_CC" || PTHREAD_CC="$CC"
test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX"
AC_SUBST([PTHREAD_LIBS])
AC_SUBST([PTHREAD_CFLAGS])
AC_SUBST([PTHREAD_CC])
AC_SUBST([PTHREAD_CXX])
# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
if test "x$ax_pthread_ok" = "xyes"; then
ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1])
:
else
ax_pthread_ok=no
$2
fi
AC_LANG_POP
])dnl AX_PTHREAD
@@ -0,0 +1,34 @@
dnl # SPDX-License-Identifier: CDDL-1.0
dnl #
dnl # 5.6 API change
dnl # Before 5.6, fs_parse() took a struct fs_parameter_description
dnl # which wraps the parameter specs with name and enum pointers. From 5.6,
dnl # the description struct was removed and fs_parse() accepts the
dnl # fs_parameter_spec directly.
dnl #
dnl # Register the "fs_parse" conftest source. The test body calls
dnl # fs_parse() with an array of struct fs_parameter_spec as its second
dnl # argument, so it only compiles when the kernel accepts the spec
dnl # array directly. The unused attributes keep the otherwise idle
dnl # locals from producing warnings in the test build.
AC_DEFUN([ZFS_AC_KERNEL_SRC_FS_PARSE], [
ZFS_LINUX_TEST_SRC([fs_parse], [
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
],[
static const struct fs_parameter_spec specs[] = {
{}
};
int test __attribute__ ((unused));
struct fs_context *fc __attribute__ ((unused)) = NULL;
struct fs_parameter param __attribute__ ((unused));
struct fs_parse_result result __attribute__ ((unused));
test = fs_parse(fc, specs, &param, &result);
])
])
dnl # Report the outcome of the "fs_parse" conftest. When it compiled,
dnl # HAVE_FS_PARSE_TAKES_SPEC is defined to 1; otherwise nothing is
dnl # defined and only "no" is printed.
AC_DEFUN([ZFS_AC_KERNEL_FS_PARSE], [
AC_MSG_CHECKING([whether fs_parse() takes fs_parameter_spec directly])
ZFS_LINUX_TEST_RESULT([fs_parse], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_FS_PARSE_TAKES_SPEC, 1,
[fs_parse() takes fs_parameter_spec directly])
],[
AC_MSG_RESULT(no)
])
])
+113 -3
View File
@@ -78,6 +78,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE
ZFS_AC_KERNEL_SRC_SECURITY_INODE
ZFS_AC_KERNEL_SRC_FS_CONTEXT
ZFS_AC_KERNEL_SRC_FS_PARSE
ZFS_AC_KERNEL_SRC_SB_DYING
ZFS_AC_KERNEL_SRC_SET_NLINK
ZFS_AC_KERNEL_SRC_SGET
@@ -153,9 +154,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
;;
esac
AC_MSG_CHECKING([for available kernel interfaces])
ZFS_LINUX_TEST_COMPILE_ALL([kabi])
AC_MSG_RESULT([done])
ZFS_LINUX_TEST_COMPILE_ALL([kabi], [for available kernel interfaces])
])
dnl #
@@ -203,6 +202,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_TRUNCATE_SETSIZE
ZFS_AC_KERNEL_SECURITY_INODE
ZFS_AC_KERNEL_FS_CONTEXT
ZFS_AC_KERNEL_FS_PARSE
ZFS_AC_KERNEL_SB_DYING
ZFS_AC_KERNEL_SET_NLINK
ZFS_AC_KERNEL_SGET
@@ -753,6 +753,108 @@ AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [
], [], [yes])
])
dnl #
dnl # Progress output for ZFS_LINUX_TEST_COMPILE_ALL
dnl #
dnl # From clean, we currently have ~250 kernel tests to compile. This can
dnl # take anywhere from a few seconds to a few minutes while we wait for
dnl # the module build invocation to complete (see ZFS_LINUX_COMPILE).
dnl #
dnl # To show some progress in the main set of tests, we start a background
dnl # job to monitor the build progress and update the output.
dnl #
dnl #
dnl # _ZFS_LINUX_TEST_COMPILE_PROGRESS_START(dir, message)
dnl #
dnl # Print the usual "checking message..." line and, when configure is
dnl # running interactively, start a background job that periodically
dnl # rewrites that line with a "compiled/total" counter.
dnl #
dnl #   dir      build directory containing the top Makefile and build.log
dnl #   message  text handed to AC_MSG_CHECKING
dnl #
dnl # On return _zfs_linux_test_progress_text and
dnl # _zfs_linux_test_progress_pid are set if (and only if) the background
dnl # job was started; _ZFS_LINUX_TEST_COMPILE_PROGRESS_DONE uses them to
dnl # stop the job and restore the line.
dnl #
AC_DEFUN([_ZFS_LINUX_TEST_COMPILE_PROGRESS_START], [
	dnl # normal "checking for..." output
	AC_MSG_CHECKING([$2])

	dnl # don't start the background job if configure was called with
	dnl # --silent or --quiet, or if configure's output stream is not
	dnl # attached to a terminal. the two conditions are separate test
	dnl # invocations joined by the shell, because the -a operator of
	dnl # test is not portable.
	AS_IF([test "x$silent" != "xyes" && test -t AS_MESSAGE_FD], [
		dnl # save "checking" message for cleanup later
		_zfs_linux_test_progress_text="$2"

		dnl # new shell job in background
		(
			dnl # ZFS_LINUX_CONFTEST_MAKEFILE adds one line per
			dnl # test to the top Makefile, so the line count
			dnl # is our target
			total=$(wc -l < $1/Makefile)
			count=0

			dnl # eject if our parent process has gone away. this
			dnl # is protection against the parent being killed.
			dnl # (we can't use trap because autoconf generates
			dnl # that and doesn't provide an easy way to hook it).
			while kill -0 $$ 2>/dev/null ; do
				dnl # ZFS_LINUX_TEST_COMPILE_ALL has a short
				dnl # second stage for modpost, where build.log
				dnl # is recreated. we make some effort to both
				dnl # detect that and handle it, mostly by
				dnl # making sure the counter never goes
				dnl # backwards.
				if test "$count" -lt "$total" ; then
					dnl # if build.log went away, then
					dnl # we never got to do a last count,
					dnl # so we can assume they're all
					dnl # finished and just bump the count
					dnl # to the total
					if ! test -f $1/build.log ; then
						count=$total
					else
						dnl # look for compilation lines
						dnl # (CC) for .o files that
						dnl # are in a dir (so not
						dnl # whole-of-build artifacts)
						dnl # and only have a single
						dnl # period (so not .mod.o
						dnl # link artifacts)
						count_n=$(awk '/CC/ && /\/[[^\.]]+\.o$/ { c++ } END { print c }' $1/build.log 2>/dev/null)

						dnl # empty output means awk
						dnl # failed (likely build.log
						dnl # went away) or matched
						dnl # nothing yet; keep the
						dnl # current count then. also
						dnl # ignore a smaller count
						dnl # from a freshly recreated
						dnl # build.log so the counter
						dnl # never goes backwards.
						if test "x$count_n" != "x" && test "$count_n" -gt "$count" ; then
							count=$count_n
						fi
					fi

					dnl # re-output the entire message with
					dnl # the new counts, on the same
					dnl # descriptor the cleanup macro uses
					printf '\rchecking %s... %d/%d' "$2" "$count" "$total" >&AS_MESSAGE_FD
				fi

				dnl # yield before loop
				sleep 0.5
			done
		) &

		dnl # save the pid so we can kill it later
		_zfs_linux_test_progress_pid=$!
	])
])
dnl #
dnl # _ZFS_LINUX_TEST_COMPILE_PROGRESS_DONE(result)
dnl #
dnl # Stop the background progress job, if one was started, restore the
dnl # plain "checking ..." line and then print the result text through
dnl # AC_MSG_RESULT. When no job was started (the pid variable is
dnl # empty) only the AC_MSG_RESULT call is made.
dnl #
AC_DEFUN([_ZFS_LINUX_TEST_COMPILE_PROGRESS_DONE], [
dnl # only do cleanup if we actually started the job
AS_IF([test "x$_zfs_linux_test_progress_pid" != "x"], [
dnl # kill it; no-op if it already died
kill $_zfs_linux_test_progress_pid 2>/dev/null
dnl # wait for it to really go away and clean it up
wait $_zfs_linux_test_progress_pid 2>/dev/null
dnl # reprint the original checking line. the control code
dnl # is ANSI "erase entire line": ESC, left square bracket, 2, K.
dnl # the bracket is spelled as octal 133 so that no literal
dnl # bracket appears inside the m4-quoted macro body.
printf '\r\033\1332Kchecking %s... ' "$_zfs_linux_test_progress_text" >&AS_MESSAGE_FD
dnl # cleanup for next run
_zfs_linux_test_progress_pid=
_zfs_linux_test_progress_text=
])
dnl # normal final output for screen and config.log
AC_MSG_RESULT([$1])
])
dnl #
dnl # Perform the compilation of the test cases in two phases.
dnl #
@@ -771,6 +873,10 @@ dnl # The maximum allowed parallelism can be controlled by setting the
dnl # TEST_JOBS environment variable. Otherwise, it defaults to $(nproc).
dnl #
AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [
AS_IF([test "x$2" != "x"], [
_ZFS_LINUX_TEST_COMPILE_PROGRESS_START([build], [$2])
])
dnl # Phase 1 - Compilation only, final linking is skipped.
ZFS_LINUX_TEST_COMPILE([$1], [build])
@@ -818,6 +924,10 @@ AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [
])
done
])
AS_IF([test "x$2" != "x"], [
_ZFS_LINUX_TEST_COMPILE_PROGRESS_DONE([done])
])
])
dnl #
+12
View File
@@ -39,6 +39,18 @@ dnl # (If INVARIANTS is detected, we need to force DEBUG, or strange panics
dnl # can ensue.)
dnl #
AC_DEFUN([ZFS_AC_DEBUG], [
dnl #
dnl # In the Linux kernel copy-builtin build, assertion/debug support
dnl # is selected by CONFIG_ZFS_DEBUG (Kconfig).
dnl #
AH_BOTTOM([
#ifdef CONFIG_ZFS
#undef ZFS_DEBUG
#ifdef CONFIG_ZFS_DEBUG
#define ZFS_DEBUG 1
#endif
#endif])
AC_MSG_CHECKING([whether assertion support will be enabled])
AC_ARG_ENABLE([debug],
[AS_HELP_STRING([--enable-debug],
+1
View File
@@ -54,6 +54,7 @@ AC_PROG_LN_S
PKG_PROG_PKG_CONFIG
AM_PROG_AS
AM_PROG_CC_C_O
AX_PTHREAD
AX_CODE_COVERAGE
_AM_PROG_TAR(pax)
@@ -2,7 +2,6 @@ usr/bin/zarcsummary.py
usr/share/zfs/zfs-helpers.sh
etc/default/zfs
etc/init.d
etc/sudoers.d
etc/zfs/vdev_id.conf.alias.example
etc/zfs/vdev_id.conf.multipath.example
etc/zfs/vdev_id.conf.sas_direct.example
@@ -840,27 +840,41 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh,
errno);
return (-1);
}
if (chown(runtime_path, 0, 0) != 0) {
pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d",
errno);
const int runtime_fd = open(runtime_path,
O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_DIRECTORY);
if (runtime_fd < 0) {
pam_syslog(pamh, LOG_ERR, "Can't open runtime path: %d", errno);
return (-1);
}
if (chmod(runtime_path, S_IRWXU) != 0) {
if (fchown(runtime_fd, 0, 0) != 0) {
pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d",
errno);
close(runtime_fd);
return (-1);
}
if (fchmod(runtime_fd, S_IRWXU) != 0) {
pam_syslog(pamh, LOG_ERR, "Can't chmod runtime path: %d",
errno);
close(runtime_fd);
return (-1);
}
char *counter_path;
if (asprintf(&counter_path, "%s/%u", runtime_path, config->uid) == -1)
if (asprintf(&counter_path, "%u", config->uid) == -1) {
close(runtime_fd);
return (-1);
}
const int fd = open(counter_path,
const int fd = openat(runtime_fd, counter_path,
O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW,
S_IRUSR | S_IWUSR);
int ret = errno;
free(counter_path);
close(runtime_fd);
if (fd < 0) {
pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", errno);
pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", ret);
return (-1);
}
if (flock(fd, LOCK_EX) != 0) {
@@ -871,7 +885,6 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh,
char counter[20];
char *pos = counter;
int remaining = sizeof (counter) - 1;
int ret;
counter[sizeof (counter) - 1] = 0;
while (remaining > 0 && (ret = read(fd, pos, remaining)) > 0) {
remaining -= ret;
+11
View File
@@ -43,6 +43,17 @@ config ZFS
To compile this file system support as a module, choose M here.
If unsure, say N.
config ZFS_DEBUG
bool "ZFS debugging"
depends on ZFS
help
Enable ZFS debugging. This turns on all ASSERT() assertions,
enables additional debug-only code paths, and promotes
compiler warnings to errors. This should only be enabled for
development or troubleshooting.
If unsure, say N.
EOF
-8
View File
@@ -1,10 +1,4 @@
# SPDX-License-Identifier: CDDL-1.0
sudoersddir = $(sysconfdir)/sudoers.d
sudoersd_DATA = \
%D%/sudoers.d/zfs
dist_noinst_DATA += $(sudoersd_DATA)
sysconf_zfsdir = $(sysconfdir)/zfs
@@ -88,8 +82,6 @@ systemdgenerator_PROGRAMS = \
%C%_systemd_system_generators_zfs_mount_generator_LDADD = \
libzfs.la
%C%_systemd_system_generators_zfs_mount_generator_LDFLAGS = -pthread
CPPCHECKTARGETS += $(systemdgenerator_PROGRAMS)
endif
-9
View File
@@ -1,9 +0,0 @@
##
## Allow any user to run `zpool iostat/status -c smart` in order
## to read basic SMART health statistics for a pool.
##
## CAUTION: Any syntax error introduced here will break sudo.
## Editing with 'visudo' is recommended: visudo -f /etc/sudoers.d/zfs
##
# ALL ALL = (root) NOPASSWD: /usr/sbin/smartctl -a /dev/[hsv]d[a-z0-9]*
@@ -29,6 +29,5 @@
#define _SYS_ARC_OS_H
int param_set_arc_free_target(SYSCTL_HANDLER_ARGS);
int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
#endif
@@ -30,7 +30,6 @@
#include <linux/sched.h>
typedef enum {
RW_DRIVER = 2,
RW_DEFAULT = 4,
RW_NOLOCKDEP = 5
} krw_type_t;
@@ -75,20 +74,35 @@ spl_rw_set_type(krwlock_t *rwp, krw_type_t type)
{
rwp->rw_type = type;
}
/*
 * Unconditionally turn lockdep tracking off by calling lockdep_off().
 * A same-named empty macro is provided when CONFIG_LOCKDEP is not set,
 * so callers need no #ifdef of their own.
 */
static inline void
spl_rw_lockdep_off(void)
{
lockdep_off();
}
/*
 * Unconditionally turn lockdep tracking back on by calling lockdep_on();
 * the counterpart of spl_rw_lockdep_off(). A same-named empty macro is
 * provided when CONFIG_LOCKDEP is not set.
 */
static inline void
spl_rw_lockdep_on(void)
{
lockdep_on();
}
static inline void
spl_rw_lockdep_off_maybe(krwlock_t *rwp) \
{ \
if (rwp && rwp->rw_type == RW_NOLOCKDEP) \
lockdep_off(); \
spl_rw_lockdep_off(); \
}
static inline void
spl_rw_lockdep_on_maybe(krwlock_t *rwp) \
{ \
if (rwp && rwp->rw_type == RW_NOLOCKDEP) \
lockdep_on(); \
spl_rw_lockdep_on(); \
}
#else /* CONFIG_LOCKDEP */
#define spl_rw_set_type(rwp, type)
#define spl_rw_lockdep_off()
#define spl_rw_lockdep_on()
#define spl_rw_lockdep_off_maybe(rwp)
#define spl_rw_lockdep_on_maybe(rwp)
#endif /* CONFIG_LOCKDEP */
@@ -117,6 +131,56 @@ RW_READ_HELD(krwlock_t *rwp)
* will be correctly located in the users code which is important
* for the built in kernel lock analysis tools
*/
/*
 * Core of the non-blocking acquire, without any lockdep on/off handling
 * (the rw_tryenter*() wrappers add that around it).
 *
 * RW_READER attempts down_read_trylock(); RW_WRITER attempts
 * down_write_trylock() and records the owner only when the attempt
 * returned nonzero. Any other 'rw' value hits VERIFY(0).
 *
 * Evaluates to the value returned by the trylock call.
 */
#define spl_rw_tryenter_impl(rwp, rw) /* CSTYLED */ \
({ \
int _rc_ = 0; \
\
switch (rw) { \
case RW_READER: \
_rc_ = down_read_trylock(SEM(rwp)); \
break; \
case RW_WRITER: \
if ((_rc_ = down_write_trylock(SEM(rwp)))) \
spl_rw_set_owner(rwp); \
break; \
default: \
VERIFY(0); \
} \
_rc_; \
})
/*
 * Core of the acquire path, without any lockdep on/off handling (the
 * rw_enter*() wrappers add that around it).
 *
 * RW_READER calls down_read(); RW_WRITER calls down_write() and then
 * records the owner. Any other 'rw' value hits VERIFY(0).
 */
#define spl_rw_enter_impl(rwp, rw) /* CSTYLED */ \
({ \
switch (rw) { \
case RW_READER: \
down_read(SEM(rwp)); \
break; \
case RW_WRITER: \
down_write(SEM(rwp)); \
spl_rw_set_owner(rwp); \
break; \
default: \
VERIFY(0); \
} \
})
/*
 * Core of the release path, without any lockdep on/off handling (the
 * rw_exit*() wrappers add that around it).
 *
 * The mode is not passed in: if RW_WRITE_HELD(rwp) is true the owner is
 * cleared and up_write() is called, otherwise the lock is asserted to be
 * read-held and up_read() is called. Note that 'rwp' is expanded more
 * than once.
 */
#define spl_rw_exit_impl(rwp) /* CSTYLED */ \
({ \
if (RW_WRITE_HELD(rwp)) { \
spl_rw_clear_owner(rwp); \
up_write(SEM(rwp)); \
} else { \
ASSERT(RW_READ_HELD(rwp)); \
up_read(SEM(rwp)); \
} \
})
/*
 * Core of the write-to-read downgrade, without any lockdep on/off
 * handling (the rw_downgrade*() wrappers add that around it). The owner
 * is cleared before downgrade_write() is called.
 */
#define spl_rw_downgrade_impl(rwp) /* CSTYLED */ \
({ \
spl_rw_clear_owner(rwp); \
downgrade_write(SEM(rwp)); \
})
#define rw_init(rwp, name, type, arg) /* CSTYLED */ \
({ \
static struct lock_class_key __key; \
@@ -140,60 +204,60 @@ RW_READ_HELD(krwlock_t *rwp)
#define rw_tryenter(rwp, rw) /* CSTYLED */ \
({ \
int _rc_ = 0; \
\
spl_rw_lockdep_off_maybe(rwp); \
switch (rw) { \
case RW_READER: \
_rc_ = down_read_trylock(SEM(rwp)); \
break; \
case RW_WRITER: \
if ((_rc_ = down_write_trylock(SEM(rwp)))) \
spl_rw_set_owner(rwp); \
break; \
default: \
VERIFY(0); \
} \
int _rc_ = spl_rw_tryenter_impl(rwp, rw); \
spl_rw_lockdep_on_maybe(rwp); \
_rc_; \
})
/*
 * Variant of rw_tryenter() that brackets the attempt with
 * spl_rw_lockdep_off()/spl_rw_lockdep_on() unconditionally, instead of
 * the *_maybe() helpers that act only on RW_NOLOCKDEP locks.
 * Evaluates to the result of spl_rw_tryenter_impl().
 */
#define rw_tryenter_nolockdep(rwp, rw) /* CSTYLED */ \
({ \
spl_rw_lockdep_off(); \
int _rc_ = spl_rw_tryenter_impl(rwp, rw); \
spl_rw_lockdep_on(); \
_rc_; \
})
#define rw_enter(rwp, rw) /* CSTYLED */ \
({ \
spl_rw_lockdep_off_maybe(rwp); \
switch (rw) { \
case RW_READER: \
down_read(SEM(rwp)); \
break; \
case RW_WRITER: \
down_write(SEM(rwp)); \
spl_rw_set_owner(rwp); \
break; \
default: \
VERIFY(0); \
} \
spl_rw_enter_impl(rwp, rw); \
spl_rw_lockdep_on_maybe(rwp); \
})
/*
 * Variant of rw_enter() that brackets the acquire with
 * spl_rw_lockdep_off()/spl_rw_lockdep_on() unconditionally, instead of
 * the *_maybe() helpers that act only on RW_NOLOCKDEP locks.
 */
#define rw_enter_nolockdep(rwp, rw) /* CSTYLED */ \
({ \
spl_rw_lockdep_off(); \
spl_rw_enter_impl(rwp, rw); \
spl_rw_lockdep_on(); \
})
#define rw_exit(rwp) /* CSTYLED */ \
({ \
spl_rw_lockdep_off_maybe(rwp); \
if (RW_WRITE_HELD(rwp)) { \
spl_rw_clear_owner(rwp); \
up_write(SEM(rwp)); \
} else { \
ASSERT(RW_READ_HELD(rwp)); \
up_read(SEM(rwp)); \
} \
spl_rw_exit_impl(rwp); \
spl_rw_lockdep_on_maybe(rwp); \
})
/*
 * Variant of rw_exit() that brackets the release with
 * spl_rw_lockdep_off()/spl_rw_lockdep_on() unconditionally, instead of
 * the *_maybe() helpers that act only on RW_NOLOCKDEP locks.
 */
#define rw_exit_nolockdep(rwp) /* CSTYLED */ \
({ \
spl_rw_lockdep_off(); \
spl_rw_exit_impl(rwp); \
spl_rw_lockdep_on(); \
})
#define rw_downgrade(rwp) /* CSTYLED */ \
({ \
spl_rw_lockdep_off_maybe(rwp); \
spl_rw_clear_owner(rwp); \
downgrade_write(SEM(rwp)); \
spl_rw_downgrade_impl(rwp); \
spl_rw_lockdep_on_maybe(rwp); \
})
/*
 * Variant of rw_downgrade() that brackets the downgrade with
 * spl_rw_lockdep_off()/spl_rw_lockdep_on() unconditionally, instead of
 * the *_maybe() helpers that act only on RW_NOLOCKDEP locks.
 */
#define rw_downgrade_nolockdep(rwp) /* CSTYLED */ \
({ \
spl_rw_lockdep_off(); \
spl_rw_downgrade_impl(rwp); \
spl_rw_lockdep_on(); \
})
#endif /* _SPL_RWLOCK_H */
+1 -2
View File
@@ -95,8 +95,7 @@ typedef void arc_prune_func_t(uint64_t bytes, void *priv);
extern uint_t zfs_arc_average_blocksize;
extern int l2arc_exclude_special;
/* generic arc_done_func_t's which you can use */
arc_read_done_func_t arc_bcopy_func;
/* generic arc_done_func_t which can be used */
arc_read_done_func_t arc_getbuf_func;
/* generic arc_prune_func_t wrapper for callbacks */
+4 -1
View File
@@ -832,6 +832,8 @@ typedef struct arc_stats {
* due to ARC_FLAG_UNCACHED being set.
*/
kstat_named_t arcstat_uncached_evictable_metadata;
/* Number of L2ARC devices currently attached across all pools. */
kstat_named_t arcstat_l2_ndev;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
/*
@@ -1103,7 +1105,7 @@ extern arc_sums_t arc_sums;
extern hrtime_t arc_growtime;
extern boolean_t arc_warm;
extern uint_t arc_grow_retry;
extern uint_t arc_no_grow_shift;
extern uint_t zfs_arc_no_grow_shift;
extern uint_t arc_shrink_shift;
extern kmutex_t arc_prune_mtx;
extern list_t arc_prune_list;
@@ -1134,6 +1136,7 @@ extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);
extern int param_set_l2arc_dwpd_limit(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_no_grow_shift(ZFS_MODULE_PARAM_ARGS);
extern void l2arc_dwpd_bump_reset(void);
/* used in zdb.c */
+14
View File
@@ -363,6 +363,7 @@ typedef enum {
/* Small enough to not hog a whole line of printout in zpool(8). */
#define ZPROP_MAX_COMMENT 32
#define ZPROP_BOOLEAN_NA 2
#define ZPROP_BOOLEAN_INHERIT 2
#define ZPROP_VALUE "value"
#define ZPROP_SOURCE "source"
@@ -476,6 +477,8 @@ typedef enum {
VDEV_PROP_SCHEDULER,
VDEV_PROP_FDOMAIN,
VDEV_PROP_FGROUP,
VDEV_PROP_ALLOC_BIAS,
VDEV_PROP_ROTATIONAL,
VDEV_NUM_PROPS
} vdev_prop_t;
@@ -491,6 +494,16 @@ typedef enum {
VDEV_SCHEDULER_OFF
} vdev_scheduler_type_t;
/*
* Allocation bias for top-level vdevs (alloc_bias property).
*/
/*
 * The enumerators have no explicit values, so they are numbered 0..3 in
 * declaration order (VDEV_BIAS_NONE == 0).
 * NOTE(review): confirm whether these numeric values are stored or
 * exchanged with userland before reordering or inserting entries.
 */
typedef enum vdev_alloc_bias {
VDEV_BIAS_NONE,
VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */
VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */
VDEV_BIAS_DEDUP /* dedicated to dedup metadata */
} vdev_alloc_bias_t;
/*
* Dataset property functions shared between libzfs and kernel.
*/
@@ -919,6 +932,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_VDEV_ROTATIONAL "rotational"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
#define ZPOOL_CONFIG_SPARES "spares"
@@ -330,7 +330,7 @@ struct metaslab_group {
*
* As the space map grows (as a result of the appends) it will
* eventually become space-inefficient. When the metaslab's in-core
* free tree is zfs_condense_pct/100 times the size of the minimal
* free tree is zfs_metaslab_condense_pct/100 times the size of the minimal
* on-disk representation, we rewrite it in its minimized form. If a
* metaslab needs to condense then we must set the ms_condensing flag to
* ensure that allocations are not performed on the metaslab that is
+1 -8
View File
@@ -155,14 +155,6 @@ struct vdev_queue {
kmutex_t vq_lock;
};
typedef enum vdev_alloc_bias {
VDEV_BIAS_NONE,
VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */
VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */
VDEV_BIAS_DEDUP /* dedicated to dedup metadata */
} vdev_alloc_bias_t;
/*
* On-disk indirect vdev state.
*
@@ -600,6 +592,7 @@ extern boolean_t vdev_log_state_valid(vdev_t *vd);
extern int vdev_load(vdev_t *vd);
extern int vdev_dtl_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_dispatch(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
+151 -87
View File
@@ -24,6 +24,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2026, TrueNAS.
*/
#ifndef _SYS_ZAP_H
@@ -121,13 +122,13 @@ typedef enum zap_flags {
/*
* Create a new zapobj with no attributes and return its object number.
*/
uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
uint64_t zap_create(objset_t *os, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot,
uint64_t zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags,
uint64_t zap_create_norm_dnsize(objset_t *os, int normflags,
dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
@@ -137,11 +138,22 @@ uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
int dnodesize, dmu_tx_t *tx);
/*
* Create a zap object and return a pointer to the newly allocated dnode via
* the allocated_dnode argument. The returned dnode will be held and the
* caller is responsible for releasing the hold by calling dnode_rele().
*/
uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, int dnodesize,
dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx);
/*
* Create a new zapobj with no attributes, and add an entry to an existing
* zapobj with the given name as key and the object number of the new zapobj as
* the value. Returns the object number of the new zapobj.
*/
uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
uint64_t parent_obj, const char *name, dmu_tx_t *tx);
uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
@@ -157,20 +169,21 @@ void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags,
* Create a new zapobj with no attributes from the given (unallocated)
* object number.
*/
int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
int zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
int zap_create_claim_norm(objset_t *ds, uint64_t obj,
int zap_create_claim_norm(objset_t *os, uint64_t obj,
int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj,
int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
/*
* The zapobj passed in must be a valid ZAP object for all of the
* following routines.
* All operations on a zapobj take either the objset/objectid pair
* that "names" the object, or an existing dnode_t for the object. The
* zapobj passed in must be a valid ZAP object.
*/
/*
@@ -178,7 +191,7 @@ int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
*
* Frees the object number using dmu_object_free.
*/
int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx);
/*
* Manipulate attributes.
@@ -207,21 +220,32 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
* fit will be transferred to 'buf'. If the entire attribute was not
* transferred, the call will return EOVERFLOW.
*/
int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
int zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf);
/*
* If rn_len is nonzero, realname will be set to the name of the found
* entry (which may be different from the requested name if matchtype is
* not MT_EXACT).
* not zero).
*
* If normalization_conflictp is not NULL, it will be set if there is
* another name with the same case/unicode normalized form.
*/
int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *normalization_conflictp);
int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp);
/*
* The _uint64 variants take an array of uint64_t as the key. The ZAP must
* be created with ZAP_FLAG_UINT64_KEY.
*/
int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
@@ -229,20 +253,31 @@ int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
uint64_t *actual_num_integers);
int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
/*
* Lookup the attribute with the given name. Returns ENOENT if it does not
* exist, 0 if it does. This is like zap_lookup(), but may be more efficient.
*/
int zap_contains(objset_t *os, uint64_t zapobj, const char *name);
int zap_contains_by_dnode(dnode_t *dn, const char *name);
/*
* Prefetch the blocks within the ZAP where the given key is stored. The
* prefetch IO will occur in the background.
*/
int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
int zap_prefetch_object(objset_t *os, uint64_t zapobj);
/* Prefetch by uint64_t[] key. */
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints);
int zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints);
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp);
/*
* Prefetch the entire ZAP object. Unlike zap_prefetch(), will block until
* the entire object is loaded into the ARC.
*/
int zap_prefetch_object(objset_t *os, uint64_t zapobj);
/*
* Create an attribute with the given name and value.
@@ -250,13 +285,15 @@ int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
* If an attribute with the given name already exists, the call will
* fail and return EEXIST.
*/
int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
int zap_add(objset_t *os, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int zap_add_by_dnode(dnode_t *dn, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
/* Add by uint64_t[] key. */
int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
@@ -271,8 +308,12 @@ int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
* existing attribute's integer size, in which case the attribute's
* integer size will be updated to the new value.
*/
int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
int zap_update(objset_t *os, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
int zap_update_by_dnode(dnode_t *dn, const char *name, int integer_size,
uint64_t num_integers, const void *val, dmu_tx_t *tx);
/* Update by uint64_t[] key. */
int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
@@ -287,8 +328,12 @@ int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
* If the requested attribute does not exist, the call will fail and
* return ENOENT.
*/
int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
int zap_length(objset_t *os, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers);
int zap_length_by_dnode(dnode_t *dn, const char *name,
uint64_t *integer_size, uint64_t *num_integers);
/* Attribute length by uint64_t[] key. */
int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t *integer_size, uint64_t *num_integers);
int zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
@@ -300,10 +345,12 @@ int zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
* If the specified attribute does not exist, the call will fail and
* return ENOENT.
*/
int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
matchtype_t mt, dmu_tx_t *tx);
int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx);
int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
matchtype_t mt, dmu_tx_t *tx);
/* Remove by uint64_t[] key. */
int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx);
int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
@@ -313,9 +360,19 @@ int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
* Returns (in *count) the number of attributes in the specified zap
* object.
*/
int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count);
int zap_count_by_dnode(dnode_t *dn, uint64_t *count);
/*
* Look up an existing uint64 value, add the delta value to it, and
* update it with the new value. If the new value is 0, removes the key
* entirely.
*/
int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
dmu_tx_t *tx);
int zap_increment_by_dnode(dnode_t *dn, const char *name, int64_t delta,
dmu_tx_t *tx);
/*
* Returns (in name) the name of the entry whose (value & mask)
* (za_first_integer) is value, or ENOENT if not found. The string
@@ -324,21 +381,8 @@ int zap_count_by_dnode(dnode_t *dn, uint64_t *count);
*/
int zap_value_search(objset_t *os, uint64_t zapobj,
uint64_t value, uint64_t mask, char *name, uint64_t namelen);
/*
* Transfer all the entries from fromobj into intoobj. Only works on
* int_size=8 num_integers=1 values. Fails if there are any duplicated
* entries.
*/
int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
/* Same as zap_join, but set the values to 'value'. */
int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
uint64_t value, dmu_tx_t *tx);
/* Same as zap_join, but add together any duplicated entries. */
int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
dmu_tx_t *tx);
int zap_value_search_by_dnode(dnode_t *dn,
uint64_t value, uint64_t mask, char *name, uint64_t namelen);
/*
* Manipulate entries where the name + value are the "same" (the name is
@@ -347,8 +391,10 @@ int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
dmu_tx_t *tx);
int zap_add_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx);
int zap_remove_int_by_dnode(dnode_t *dn, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int_by_dnode(dnode_t *dn, uint64_t value);
/* Here the key is an int and the value is a different int. */
int zap_add_int_key(objset_t *os, uint64_t obj,
@@ -358,22 +404,19 @@ int zap_update_int_key(objset_t *os, uint64_t obj,
int zap_lookup_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t *valuep);
int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
dmu_tx_t *tx);
int zap_add_int_key_by_dnode(dnode_t *dn,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_update_int_key_by_dnode(dnode_t *dn,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int_key_by_dnode(dnode_t *dn,
uint64_t key, uint64_t *valuep);
struct zap;
struct zap_leaf;
typedef struct zap_cursor {
/* This structure is opaque! */
objset_t *zc_objset;
struct zap *zc_zap;
struct zap_leaf *zc_leaf;
uint64_t zc_zapobj;
uint64_t zc_serialized;
uint64_t zc_hash;
uint32_t zc_cd;
boolean_t zc_prefetch;
} zap_cursor_t;
/*
* The interface for listing all the attributes of a zapobj can be
* thought of as cursor moving down a list of the attributes one by
* one. The cookie returned by the zap_cursor_serialize routine is
* persistent across system calls (and across reboot, even).
*/
typedef struct {
int za_integer_length;
@@ -389,9 +432,6 @@ typedef struct {
char za_name[];
} zap_attribute_t;
void zap_init(void);
void zap_fini(void);
/*
* Alloc and free zap_attribute_t.
*/
@@ -399,22 +439,52 @@ zap_attribute_t *zap_attribute_alloc(void);
zap_attribute_t *zap_attribute_long_alloc(void);
void zap_attribute_free(zap_attribute_t *attrp);
/*
* The interface for listing all the attributes of a zapobj can be
* thought of as cursor moving down a list of the attributes one by
* one. The cookie returned by the zap_cursor_serialize routine is
* persistent across system calls (and across reboot, even).
struct zap;
struct zap_leaf;
typedef struct zap_cursor {
/* This structure is opaque! */
struct zap *zc_zap;
struct zap_leaf *zc_leaf;
uint64_t zc_hash;
uint32_t zc_cd;
boolean_t zc_prefetch;
/*
* Legacy fields to maintain source compat with Lustre, which accesses
* them directly. Not to be used in new code!
*/
objset_t *zc_objset;
uint64_t zc_zapobj;
} zap_cursor_t;
/*
* Initialize a zap cursor, pointing to the "first" attribute of the
* zapobj. You must _fini the cursor when you are done with it.
* Initialize a zap cursor, pointing to the "first" attribute of the zapobj.
* The entire zapobj will be prefetched. You must call zap_cursor_fini() on
* the cursor when you are done with it.
*/
void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
uint64_t zapobj);
int zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
int zap_cursor_init_by_dnode(zap_cursor_t *zc, dnode_t *dn);
void zap_cursor_fini(zap_cursor_t *zc);
/*
* Initialize a cursor at the beginning, but request that we not prefetch
* the entire ZAP object.
*/
int zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
uint64_t zapobj);
/*
* Initialize a zap cursor pointing to the position recorded by
* zap_cursor_serialize (in the "serialized" argument). You can also
* use a "serialized" argument of 0 to start at the beginning of the
* zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
* zap_cursor_init(...).)
*/
int zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os,
uint64_t zapobj, uint64_t serialized);
int zap_cursor_init_serialized_by_dnode(zap_cursor_t *zc, dnode_t *dn,
uint64_t serialized);
/*
* Get the attribute currently pointed to by the cursor. Returns
* ENOENT if at the end of the attributes.
@@ -435,17 +505,6 @@ void zap_cursor_advance(zap_cursor_t *zc);
*/
uint64_t zap_cursor_serialize(zap_cursor_t *zc);
/*
* Initialize a zap cursor pointing to the position recorded by
* zap_cursor_serialize (in the "serialized" argument). You can also
* use a "serialized" argument of 0 to start at the beginning of the
* zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
* zap_cursor_init(...).)
*/
void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
uint64_t zapobj, uint64_t serialized);
#define ZAP_HISTOGRAM_SIZE 10
typedef struct zap_stats {
@@ -535,7 +594,12 @@ typedef struct zap_stats {
* statistics. This interface shouldn't be relied on unless you really
* know what you're doing.
*/
int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs);
int zap_get_stats_by_dnode(dnode_t *dn, zap_stats_t *zs);
/* ZAP subsystem setup/teardown */
void zap_init(void);
void zap_fini(void);
#ifdef __cplusplus
}
+92 -18
View File
@@ -26,6 +26,7 @@
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2024, Klara, Inc.
* Copyright (c) 2026, TrueNAS.
*/
#ifndef _SYS_ZAP_IMPL_H
@@ -33,7 +34,6 @@
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/avl.h>
#ifdef __cplusplus
extern "C" {
@@ -62,8 +62,9 @@ typedef struct mzap_phys {
uint64_t mz_salt;
uint64_t mz_normflags;
uint64_t mz_pad[5];
mzap_ent_phys_t mz_chunk[1];
/* actually variable size depending on block size */
mzap_ent_phys_t mz_chunk[];
} mzap_phys_t;
typedef struct mzap_ent {
@@ -170,6 +171,9 @@ typedef struct zap {
} zap_u;
} zap_t;
#define zap_f zap_u.zap_fat
#define zap_m zap_u.zap_micro
static inline zap_phys_t *
zap_f_phys(zap_t *zap)
{
@@ -182,6 +186,10 @@ zap_m_phys(zap_t *zap)
return (zap->zap_dbuf->db_data);
}
/*
* zap_name_t carries the original key and whatever we've derived from it
* (normalised form, hash, etc) as we work through completing the operation.
*/
typedef struct zap_name {
zap_t *zn_zap;
int zn_key_intlen;
@@ -196,25 +204,94 @@ typedef struct zap_name {
char zn_normbuf[];
} zap_name_t;
#define zap_f zap_u.zap_fat
#define zap_m zap_u.zap_micro
/*
* Allocate a zap_name_t. The longname flag ensures there is enough room to
* hold a long filename when the 'longname' pool feature is active.
*/
zap_name_t *zap_name_alloc(zap_t *zap, boolean_t longname);
/*
* Allocate a zap_name_t for the given key. zap_name_init_str() will be
* called to normalise the key and initialise the struct.
*/
zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt);
/*
* Allocate a zap_name_t for a uint64 array key.
*/
zap_name_t *zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints);
/*
* Free a zap_name_t.
*/
void zap_name_free(zap_name_t *zn);
/*
* Initialise an existing zap_name_t with the normalised form of the key,
* computed according to the given matchtype.
*/
int zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt);
/*
* Compare 'matchname' with the name represented by the zap_name_t, applying
* the same normalisation method first. Returns true if the normalised forms
* match, false otherwise.
*/
boolean_t zap_match(zap_name_t *zn, const char *matchname);
int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
/*
* Compute and return the 64-bit hash for the name, according to the name
* type and hash flags.
*/
uint64_t zap_hash(zap_name_t *zn);
/*
* Return a zap_t for the given on-disk object, locked and ready for use.
* The zap_t will be allocated and loaded from disk if it's not already loaded.
*/
int zap_lock(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
zap_t **zapp);
void zap_unlockdir(zap_t *zap, const void *tag);
int zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
zap_t **zapp);
/* Unlock and release a zap_t. */
void zap_unlock(zap_t *zap, const void *tag);
/*
* Try to upgrade a zap lock from READER to WRITER. If the upgrade is not
* possible without blocking, returns 0. If the upgrade happened, returns 1.
*/
int zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx);
/*
* Upgrade a zap lock from READER to WRITER. If it can't be upgraded
* immediately it will block.
*/
void zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx);
/* zap_t release function for when associated dbuf is evicted. */
void zap_evict_sync(void *dbu);
zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn);
/* Misc internal state & config. */
int zap_hashbits(zap_t *zap);
uint32_t zap_maxcd(zap_t *zap);
uint64_t zap_getflags(zap_t *zap);
/* Microzap implementation. */
zap_t *mzap_open(dmu_buf_t *db);
int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
mzap_ent_t *mze_find(zap_name_t *zn, zfs_btree_index_t *idx);
boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash);
void mze_destroy(zap_t *zap);
boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn,
mzap_ent_t *mze, zfs_btree_index_t *idx);
void mzap_addent(zap_name_t *zn, uint64_t value);
void mzap_byteswap(mzap_phys_t *buf, size_t size);
uint64_t zap_get_micro_max_size(spa_t *spa);
#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
/* Fatzap implementation. */
void fzap_byteswap(void *buf, size_t size);
int fzap_count(zap_t *zap, uint64_t *count);
int fzap_lookup(zap_name_t *zn,
@@ -223,20 +300,17 @@ int fzap_lookup(zap_name_t *zn,
uint64_t *actual_num_integers);
void fzap_prefetch(zap_name_t *zn);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
const void *val, const void *tag, dmu_tx_t *tx);
int fzap_update(zap_name_t *zn,
int integer_size, uint64_t num_integers, const void *val,
const void *tag, dmu_tx_t *tx);
const void *val, dmu_tx_t *tx);
int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int fzap_length(zap_name_t *zn,
uint64_t *integer_size, uint64_t *num_integers);
int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
void zap_put_leaf(struct zap_leaf *l);
int fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx);
int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx);
void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
#ifdef __cplusplus
+6 -7
View File
@@ -139,12 +139,12 @@ enum zio_stage {
ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W---- */
ZIO_STAGE_BRT_FREE = 1 << 9, /* --F--- */
ZIO_STAGE_DDT_READ_START = 1 << 9, /* R----- */
ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R----- */
ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W---- */
ZIO_STAGE_DDT_FREE = 1 << 12, /* --F--- */
ZIO_STAGE_DDT_READ_START = 1 << 10, /* R----- */
ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R----- */
ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W---- */
ZIO_STAGE_DDT_FREE = 1 << 13, /* --F--- */
ZIO_STAGE_BRT_FREE = 1 << 13, /* --F--- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC-- */
ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC-- */
@@ -259,8 +259,7 @@ enum zio_stage {
ZIO_STAGE_DVA_FREE)
#define ZIO_DDT_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \
(ZIO_FREE_PIPELINE | \
ZIO_STAGE_ISSUE_ASYNC | \
ZIO_STAGE_DDT_FREE)
@@ -63,7 +63,3 @@ libspl_la_LIBADD = \
libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME)
libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS)
if BUILD_FREEBSD
libspl_assert_la_LIBADD += -lpthread
endif
+1 -3
View File
@@ -76,7 +76,7 @@ libzfs_la_LIBADD = \
libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL)
libzfs_la_LDFLAGS = -pthread
libzfs_la_LDFLAGS = -version-info 7:0:0
if !ASAN_ENABLED
libzfs_la_LDFLAGS += -Wl,-z,defs
@@ -86,8 +86,6 @@ if BUILD_FREEBSD
libzfs_la_LIBADD += -lutil -lgeom
endif
libzfs_la_LDFLAGS += -version-info 7:0:0
pkgconfig_DATA += %D%/libzfs.pc
dist_noinst_DATA += %D%/libzfs.abi %D%/libzfs.suppr
+7 -2
View File
@@ -2553,7 +2553,7 @@
<typedef-decl name='__uint32_t' type-id='f0981eeb' id='62f1140c'/>
<typedef-decl name='__uint64_t' type-id='7359adad' id='8910171f'/>
<typedef-decl name='size_t' type-id='7359adad' id='b59d7dce'/>
<class-decl name='libzfs_handle' size-in-bits='18432' is-struct='yes' visibility='default' id='c8a9d9d8'>
<class-decl name='libzfs_handle' size-in-bits='18496' is-struct='yes' visibility='default' id='c8a9d9d8'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='libzfs_error' type-id='95e97e5e' visibility='default'/>
</data-member>
@@ -2605,6 +2605,9 @@
<data-member access='public' layout-offset-in-bits='18112'>
<var-decl name='zh_mnttab' type-id='f20fbd51' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='18432'>
<var-decl name='zh_mnttab_cache_enabled' type-id='c19b74c3' visibility='default'/>
</data-member>
</class-decl>
<class-decl name='zfs_handle' size-in-bits='4928' is-struct='yes' visibility='default' id='f6ee4445'>
<data-member access='public' layout-offset-in-bits='0'>
@@ -6412,7 +6415,9 @@
<enumerator name='VDEV_PROP_SCHEDULER' value='55'/>
<enumerator name='VDEV_PROP_FDOMAIN' value='56'/>
<enumerator name='VDEV_PROP_FGROUP' value='57'/>
<enumerator name='VDEV_NUM_PROPS' value='58'/>
<enumerator name='VDEV_PROP_ALLOC_BIAS' value='58'/>
<enumerator name='VDEV_PROP_ROTATIONAL' value='59'/>
<enumerator name='VDEV_NUM_PROPS' value='60'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
@@ -177,6 +177,7 @@ changelist_postfix(prop_changelist_t *clp)
char shareopts[ZFS_MAXPROPLEN];
boolean_t commit_smb_shares = B_FALSE;
boolean_t commit_nfs_shares = B_FALSE;
int rc = 0;
/*
* If CL_GATHER_DONT_UNMOUNT is set, it means we don't want to (un)mount
@@ -266,7 +267,7 @@ changelist_postfix(prop_changelist_t *clp)
const enum sa_protocol nfs[] =
{SA_PROTOCOL_NFS, SA_NO_PROTOCOL};
if (sharenfs && mounted) {
zfs_share(cn->cn_handle, nfs);
rc = zfs_share(cn->cn_handle, nfs);
commit_nfs_shares = B_TRUE;
} else if (cn->cn_shared || clp->cl_waslegacy) {
zfs_unshare(cn->cn_handle, NULL, nfs);
@@ -275,7 +276,7 @@ changelist_postfix(prop_changelist_t *clp)
const enum sa_protocol smb[] =
{SA_PROTOCOL_SMB, SA_NO_PROTOCOL};
if (sharesmb && mounted) {
zfs_share(cn->cn_handle, smb);
rc = zfs_share(cn->cn_handle, smb);
commit_smb_shares = B_TRUE;
} else if (cn->cn_shared || clp->cl_waslegacy) {
zfs_unshare(cn->cn_handle, NULL, smb);
@@ -291,7 +292,15 @@ changelist_postfix(prop_changelist_t *clp)
*p++ = SA_NO_PROTOCOL;
zfs_commit_shares(proto);
return (0);
/*
* It's possible rc != 0 since we set a mountpoint or option while
* SMB/NFS was not running. This is fine, and we should not return
* an error up the stack.
*
* At this point we only want to report mountpoint/shareopts parsing
* errors.
*/
return (rc == SA_SYNTAX_ERR ? rc : 0);
}
/*
+60 -1
View File
@@ -2031,12 +2031,21 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
return (0);
}
/*
* Export the pool from the system. Setting force overrides the
* active-shared-spare check. The caller must unmount all datasets
* in the pool first.
*/
int
zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str)
{
return (zpool_export_common(zhp, force, B_FALSE, log_str));
}
/*
* Force-export the pool: bypasses the active-shared-spare check, and skips
* writing the exported-state labels and updating the cachefile.
*/
int
zpool_export_force(zpool_handle_t *zhp, const char *log_str)
{
@@ -2574,6 +2583,10 @@ xlate_init_err(int err)
return (err);
}
/*
* Start (or cancel/suspend/uninit) the initialize operation on every
* leaf vdev of the pool.
*/
int
zpool_initialize_one(zpool_handle_t *zhp, void *data)
{
@@ -2685,6 +2698,10 @@ zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
return (err == 0 ? 0 : -1);
}
/*
* Start (or cancel/suspend/uninit) the initialize operation on the listed
* vdevs. Returns once the new state is committed.
*/
int
zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
nvlist_t *vds)
@@ -2692,6 +2709,9 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE));
}
/*
* Like zpool_initialize(), but waits for each listed vdev to finish.
*/
int
zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
nvlist_t *vds)
@@ -2746,6 +2766,10 @@ zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
}
}
/*
* Start (or cancel/suspend) the trim operation on every leaf vdev of
* the pool.
*/
int
zpool_trim_one(zpool_handle_t *zhp, void *data)
{
@@ -3393,6 +3417,11 @@ __zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
return (ret);
}
/*
* Look up a vdev in the pool by path, name, or guid. Returns the
* vdev's configuration nvlist, or NULL on no match. Also, fills
* in avail_spare, l2cache, and log if they are non-NULL.
*/
nvlist_t *
zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
boolean_t *l2cache, boolean_t *log)
@@ -4637,7 +4666,10 @@ zpool_reopen_one(zpool_handle_t *zhp, void *data)
return (0);
}
/* call into libzfs_core to execute the sync IOCTL per pool */
/*
* Block until every buffered write for the pool has reached the
* underlying disks.
*/
int
zpool_sync_one(zpool_handle_t *zhp, void *data)
{
@@ -4913,6 +4945,10 @@ zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
return (0);
}
/*
* Format the program name and its command-line arguments into a single
* space-separated string.
*/
void
zfs_save_arguments(int argc, char **argv, char *string, int len)
{
@@ -4925,6 +4961,10 @@ zfs_save_arguments(int argc, char **argv, char *string, int len)
}
}
/*
* Append a message to the pool's command-history log, retrievable via
* "zpool history".
*/
int
zpool_log_history(libzfs_handle_t *hdl, const char *message)
{
@@ -5220,6 +5260,11 @@ zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
free(mntpnt);
}
/*
* Translate a (dataset object id, file object id) pair into a readable
* path. If the dataset is mounted the result is an absolute filesystem
* path; otherwise it is `dataset:path`.
*/
void
zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
char *pathname, size_t len)
@@ -5227,6 +5272,10 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_FALSE);
}
/*
* Translate a (dataset object id, file object id) pair into a
* `dataset:path` string.
*/
void
zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
char *pathname, size_t len)
@@ -5281,6 +5330,10 @@ zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity,
return (error);
}
/*
* Store a boot configuration map in the bootenv area of each leaf
* vdev's labels.
*/
int
zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap)
{
@@ -5294,6 +5347,9 @@ zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap)
return (error);
}
/*
* Read the boot configuration map from each leaf vdev's bootenv area.
*/
int
zpool_get_bootenv(zpool_handle_t *zhp, nvlist_t **nvlp)
{
@@ -5741,6 +5797,9 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
return (ENOENT);
if (prop == VDEV_PROP_SIT_OUT)
return (ENOENT);
/* Only valid for top-level vdevs */
if (prop == VDEV_PROP_ALLOC_BIAS)
return (ENOENT);
}
if (vdev_prop_index_to_string(prop, intval,
(const char **)&strval) != 0)
@@ -64,6 +64,10 @@ sa_enable_share(const char *zfsname, const char *mountpoint,
{
VALIDATE_PROTOCOL(protocol, SA_INVALID_PROTOCOL);
int error = sa_validate_shareopts(shareopts, protocol);
if (error != SA_OK)
return (error);
const struct sa_share_impl args =
init_share(zfsname, mountpoint, shareopts);
return (fstypes[protocol]->enable_share(&args));
@@ -111,6 +115,10 @@ sa_validate_shareopts(const char *options, enum sa_protocol protocol)
{
VALIDATE_PROTOCOL(protocol, SA_INVALID_PROTOCOL);
/* error out on invalid characters */
if (strpbrk(options, "\a\b\f\n\r") != NULL)
return (SA_SYNTAX_ERR);
return (fstypes[protocol]->validate_shareopts(options));
}
@@ -33,7 +33,7 @@ libzfs_core_la_LIBADD = \
libzfs_core_la_LIBADD += $(LTLIBINTL)
libzfs_core_la_LDFLAGS = -pthread
libzfs_core_la_LDFLAGS = -version-info 3:0:0
if !ASAN_ENABLED
libzfs_core_la_LDFLAGS += -Wl,-z,defs
@@ -43,8 +43,6 @@ if BUILD_FREEBSD
libzfs_core_la_LIBADD += -lutil -lgeom
endif
libzfs_core_la_LDFLAGS += -version-info 3:0:0
pkgconfig_DATA += %D%/libzfs_core.pc
dist_noinst_DATA += %D%/libzfs_core.abi %D%/libzfs_core.suppr
+3 -3
View File
@@ -166,6 +166,8 @@ nodist_libzpool_la_SOURCES = \
module/zfs/vdev_root.c \
module/zfs/vdev_trim.c \
module/zfs/zap.c \
module/zfs/zap_fat.c \
module/zfs/zap_impl.c \
module/zfs/zap_leaf.c \
module/zfs/zap_micro.c \
module/zfs/zcp.c \
@@ -212,7 +214,7 @@ libzpool_la_LIBADD = \
libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -lm
libzpool_la_LDFLAGS = -pthread
libzpool_la_LDFLAGS = -version-info 7:0:0
if !ASAN_ENABLED
libzpool_la_LDFLAGS += -Wl,-z,defs
@@ -222,8 +224,6 @@ if BUILD_FREEBSD
libzpool_la_LIBADD += -lgeom
endif
libzpool_la_LDFLAGS += -version-info 7:0:0
if TARGET_CPU_POWERPC
module/zfs/libzpool_la-vdev_raidz_math_powerpc_altivec.$(OBJEXT) : CFLAGS += -maltivec
module/zfs/libzpool_la-vdev_raidz_math_powerpc_altivec.l$(OBJEXT): CFLAGS += -maltivec
+62 -6
View File
@@ -4,6 +4,7 @@
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
.\" Copyright (c) 2026, Mateusz Piotrowski <0mp@FreeBSD.org>
.\"
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
@@ -18,7 +19,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd September 15, 2025
.Dd May 8, 2026
.Dt ZFS 4
.Os
.
@@ -389,6 +390,18 @@ this is
or
.Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 .
.
.It Sy metaslab_df_alloc_threshold Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64
Minimum size which forces the dynamic allocator to change its allocation
strategy.
Once the space map cannot satisfy an allocation of this size, it switches to a
more aggressive strategy (searching by size rather than offset).
.
.It Sy metaslab_df_free_pct Ns = Ns Sy 4 Ns % Pq uint
The minimum free space, in percent, which must be available in a space map to
continue allocations in a first-fit fashion.
Once free space drops below this level, allocations switch to a best-fit
strategy.
.
.It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int
If not searching forward (due to
.Sy metaslab_df_max_search , metaslab_df_free_pct ,
@@ -445,6 +458,32 @@ This improves performance, especially when there are many metaslabs per vdev
and the allocation can't actually be satisfied
(so we would otherwise iterate all metaslabs).
.
.It Sy zfs_metaslab_sm_blksz_no_log Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq int
Block size for the metaslab space maps in pools where the
.Sy log_spacemap
feature is disabled.
Multiple metaslabs are modified per transaction group, so a smaller block size
lets more, scattered I/O operations be issued.
Must be a power of 2 greater than
.Sy 4096 .
This parameter can only be set at module load time.
.
.It Sy zfs_metaslab_sm_blksz_with_log Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int
Block size for the metaslab space maps in pools where the
.Sy log_spacemap
feature is enabled.
Changes are batched in the per-pool log spacemap and flushed to each metaslab's
space map only occasionally, so a larger block size is more efficient.
Must be a power of 2 greater than
.Sy 4096 .
This parameter can only be set at module load time.
.
.It Sy zfs_metaslab_condense_pct Ns = Ns Sy 200 Ns % Pq uint
Condense an on-disk space map when its size exceeds this percentage of
the in-memory representation.
The minimum is
.Sy 100 .
.
.It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq uint
When a vdev is added, target this number of metaslabs per top-level vdev.
.
@@ -768,9 +807,15 @@ See also
which serves a similar purpose but has a higher priority if nonzero.
.
.It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64
Percentage of ARC dnodes to try to scan in response to demand for non-metadata
when the number of bytes consumed by dnodes exceeds
.Sy zfs_arc_dnode_limit .
Percentage used to size dnode prune requests.
The request size is the larger of two values:
.Sy zfs_arc_dnode_reduce_percent
applied to the dnode count above
.Sy zfs_arc_dnode_limit ,
or
.Sy zfs_arc_dnode_reduce_percent
applied to the total dnode count
when non-evictable metadata exceeds 3/4 of the metadata target.
.
.It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint
The ARC's buffer hash table is sized based on the assumption of an average
@@ -911,6 +956,19 @@ but that was not proven to be useful.
Number of missing top-level vdevs which will be allowed during
pool import (only in read-only mode).
.
.It Sy zfs_max_missing_tvds_cachefile Ns = Ns Sy 2 Pq u64
Number of missing top-level vdevs tolerated when importing a pool
from a cachefile, before the trusted config is read from the MOS.
A cachefile can fall out of sync with the on-disk config after a
device removal that did not rewrite the cachefile, so the default
of 2 still lets the import reach a copy of the MOS.
.
.It Sy zfs_max_missing_tvds_scan Ns = Ns Sy 0 Pq u64
Number of missing top-level vdevs tolerated when importing a pool
by scanning device paths, before the trusted config is read from
the MOS.
Defaults to 0 because a scan should detect every present device.
.
.It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64
Maximum size in bytes allowed to be passed as
.Sy zc_nvlist_src_size
@@ -948,8 +1006,6 @@ equivalent to the greater of the number of online CPUs and
If less than
.Sy arc_c No >> Sy zfs_arc_no_grow_shift
free memory is available, the ARC is not allowed to grow.
This parameter is
.Fx Ns -specific .
.
.It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int
The ARC size is considered to be overflowing if it exceeds the current
+21 -1
View File
@@ -142,6 +142,8 @@ See
.Xr zpool-attach 8 .
.It Sy trim_support
Indicates if a leaf device supports trim operations.
.It Sy rotational
Indicates whether the device backing this vdev uses rotating media.
.El
.Pp
The following native properties can be used to change the behavior of a vdev.
@@ -183,9 +185,12 @@ output.
A text comment up to 8192 characters long
.It Sy bootsize
The amount of space to reserve for the EFI system partition
.It Sy failfast
.It Sy failfast Ns = Ns Sy inherit Ns | Ns Sy on Ns | Ns Sy off
If this device should propagate BIO errors back to ZFS, used to disable
failfast.
.Sy inherit
causes the vdev to adopt the behavior of its parent vdev,
recursively up the tree.
.It Sy sit_out
Only valid for
.Sy RAIDZ
@@ -218,6 +223,21 @@ If this device should perform new allocations, used to disable a device
when it is scheduled for later removal.
See
.Xr zpool-remove 8 .
.It Sy alloc_bias Ns = Ns Sy none Ns | Ns Sy log Ns | Ns Sy special Ns | Ns Sy dedup
Controls the allocation class for a top-level vdev.
Changes take effect after an export and import of the pool.
Changing to/from log is not implemented, since it may lead to data loss in
case of the log device failure.
Setting to
.Sy special
and
.Sy dedup
requires
.Sy feature@allocation_classes
to be enabled.
At least one top-level vdev must remain in the normal
.Pq Sy none
class.
.It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy on Ns | Ns Sy off
Controls how I/O requests are added to the vdev queue when reading or
writing to this vdev.
+6 -1
View File
@@ -284,10 +284,15 @@ Decode and display block from an embedded block pointer specified by the
arguments.
.It Fl f , -file-layout
Display the file layout of an object for the disks of a raidz vdev.
Numeric values in the display are hexadecimal.
With
.Fl H ,
the output is in scripted mode for easy parsing, with all values
being presented as 512 byte blocks.
being presented as 512 byte blocks in decimal; with
.Fl v ,
the block type (parity or data) is displayed; with
.Fl vv ,
the offset into the file for each block is also printed.
Only a single top-level raidz vdev is supported.
.It Fl h , -history
Display pool history similar to
+30 -1
View File
@@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd November 8, 2023
.Dd May 9, 2026
.Dt ZPOOL-ATTACH 8
.Os
.
@@ -132,6 +132,35 @@ Waits until
has finished resilvering or expanding before returning.
.El
.
.Sh EXAMPLES
.\" Example 1 is example 5 from zpool.8.
.\" Make sure to update them bidirectionally
.Ss Example 1 : No Making a non-mirrored ZFS Storage Pool mirrored
The following command converts an existing single device
.Ar sda
into a mirror by attaching a second device to it,
.Ar sdb .
.Dl # Nm zpool Cm attach Ar tank Pa sda sdb
.
.Ss Example 2 : No Expanding a RAID-Z vdev with an additional disk
The following command adds
.Ar sdg
to the existing
.Ar raidz2-0
vdev in
.Ar tank ,
turning a 6-wide RAID-Z2 into a 7-wide RAID-Z2:
.Dl # Nm zpool Cm attach Ar tank raidz2-0 Pa sdg
Progress is reported by
.Nm zpool Cm status .
The operation requires the
.Sy raidz_expansion
pool feature, and
.Ar sdg
must be at least as large as the smallest existing disk in the vdev.
Old blocks keep their original data-to-parity ratio; only blocks written
after the expansion use the new ratio.
.
.Sh SEE ALSO
.Xr zpool-add 8 ,
.Xr zpool-detach 8 ,
+5 -5
View File
@@ -458,12 +458,12 @@ ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W----
ZIO_STAGE_NOP_WRITE:0x00000100:-W----
ZIO_STAGE_BRT_FREE:0x00000200:--F---
ZIO_STAGE_DDT_READ_START:0x00000200:R-----
ZIO_STAGE_DDT_READ_DONE:0x00000400:R-----
ZIO_STAGE_DDT_WRITE:0x00000800:-W----
ZIO_STAGE_DDT_FREE:0x00001000:--F---
ZIO_STAGE_DDT_READ_START:0x00000400:R-----
ZIO_STAGE_DDT_READ_DONE:0x00000800:R-----
ZIO_STAGE_DDT_WRITE:0x00001000:-W----
ZIO_STAGE_DDT_FREE:0x00002000:--F---
ZIO_STAGE_BRT_FREE:0x00002000:--F---
ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC--
ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC--
+1 -4
View File
@@ -109,10 +109,7 @@ environment variable set.
If a script requires the use of a privileged command, like
.Xr smartctl 8 ,
then it's recommended you allow the user access to it in
.Pa /etc/sudoers
or add the user to the
.Pa /etc/sudoers.d/zfs
file.
.Pa /etc/sudoers .
.Pp
If
.Fl c
+1
View File
@@ -245,6 +245,7 @@ Invalid command line options were specified.
.
.Sh EXAMPLES
.\" Examples 1, 2, 3, 4, 12, 13 are shared with zpool-create.8.
.\" Example 5 is shared with zpool-attach.8.
.\" Examples 6, 14 are shared with zpool-add.8.
.\" Examples 7, 16 are shared with zpool-list.8.
.\" Examples 8 are shared with zpool-destroy.8.
+11 -2
View File
@@ -4,9 +4,11 @@
ZFS_MODULE_CFLAGS += -std=gnu11 -Wno-declaration-after-statement
ZFS_MODULE_CFLAGS += -Wmissing-prototypes
ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @KERNEL_NO_FORMAT_ZERO_LENGTH@
ZFS_MODULE_CFLAGS += @KERNEL_NO_FORMAT_ZERO_LENGTH@
ifneq ($(KBUILD_EXTMOD),)
ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@
ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@
zfs_include = @abs_top_srcdir@/include
icp_include = @abs_srcdir@/icp/include
zstd_include = @abs_srcdir@/zstd/include
@@ -16,6 +18,12 @@ ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include
src = @abs_srcdir@
obj = @abs_builddir@
else
ifeq ($(CONFIG_ZFS_DEBUG),y)
ZFS_MODULE_CFLAGS += -Werror
ZFS_MODULE_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
else
ZFS_MODULE_CPPFLAGS += -UDEBUG -DNDEBUG
endif
zfs_include = $(srctree)/include/zfs
icp_include = $(src)/icp/include
zstd_include = $(src)/zstd/include
@@ -28,7 +36,6 @@ ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl
ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs
ZFS_MODULE_CFLAGS += -I$(zfs_include)
ZFS_MODULE_CPPFLAGS += -D_KERNEL
ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@
# KASAN enables -Werror=frame-larger-than=1024, which
# breaks oh so many parts of our build.
@@ -408,6 +415,8 @@ ZFS_OBJS := \
vdev_root.o \
vdev_trim.o \
zap.o \
zap_fat.o \
zap_impl.o \
zap_leaf.o \
zap_micro.o \
zcp.o \
+9 -1
View File
@@ -65,6 +65,12 @@ CFLAGS+= -DZFS_DEBUG -g
CFLAGS += -DNDEBUG
.endif
.for _SAN in KASAN KMSAN KUBSAN
.if defined(WITH_${_SAN}) && ${WITH_${_SAN}} == "true"
KERN_OPTS_EXTRA+= ${_SAN}
.endif
.endfor
.if defined(WITH_GCOV) && ${WITH_GCOV} == "true"
CFLAGS+= -fprofile-arcs -ftest-coverage
.endif
@@ -345,6 +351,8 @@ SRCS+= abd.c \
vdev_root.c \
vdev_trim.c \
zap.c \
zap_fat.c \
zap_impl.c \
zap_leaf.c \
zap_micro.c \
zcp.c \
@@ -475,8 +483,8 @@ CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual
CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
CFLAGS.zap_impl.c= -Wno-cast-qual
CFLAGS.zap_leaf.c= -Wno-cast-qual
CFLAGS.zap_micro.c= -Wno-cast-qual
CFLAGS.zcp.c= -Wno-cast-qual
CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith
CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith
+37 -12
View File
@@ -135,7 +135,8 @@
#define NVP_SIZE_CALC(name_len, data_len) \
(NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
static int i_get_value_size(data_type_t type, const void *data, uint_t nelem,
size_t max_size);
static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
uint_t nelem, const void *data);
@@ -810,8 +811,10 @@ i_validate_nvpair(nvpair_t *nvp)
* verify nvp_type, nvp_value_elem, and also possibly
* verify string values and get the value size.
*/
size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
size1 = nvp->nvp_size - NVP_VALOFF(nvp);
size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp),
size1);
if (size2 < 0 || size1 != NV_ALIGN(size2))
return (EFAULT);
@@ -1002,12 +1005,21 @@ nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
* DATA_TYPE_STRING and
* DATA_TYPE_STRING_ARRAY
* If data == NULL then the size of the string(s) is excluded.
*
* If 'max_size' is non-zero, then don't look beyond 'max_size' number of
* bytes when calculating a value size. Note that 'max_size' should include
* the NULL terminator byte when calculating string size. If 'max_size' is 0,
* it is ignored.
*/
static int
i_get_value_size(data_type_t type, const void *data, uint_t nelem)
i_get_value_size(data_type_t type, const void *data, uint_t nelem,
size_t max_size)
{
uint64_t value_sz;
if (max_size == 0)
max_size = INT32_MAX;
if (i_validate_type_nelem(type, nelem) != 0)
return (-1);
@@ -1052,10 +1064,15 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem)
break;
#endif
case DATA_TYPE_STRING:
if (data == NULL)
if (data == NULL) {
value_sz = 0;
else
value_sz = strlen(data) + 1;
} else {
value_sz = strnlen(data, max_size);
if (value_sz >= max_size) {
return (-1); /* string not terminated */
}
value_sz += 1;
}
break;
case DATA_TYPE_BOOLEAN_ARRAY:
value_sz = (uint64_t)nelem * sizeof (boolean_t);
@@ -1089,16 +1106,23 @@ i_get_value_size(data_type_t type, const void *data, uint_t nelem)
break;
case DATA_TYPE_STRING_ARRAY:
value_sz = (uint64_t)nelem * sizeof (uint64_t);
if (data != NULL) {
char *const *strs = data;
uint_t i;
size_t newsize;
/* no alignment requirement for strings */
for (i = 0; i < nelem; i++) {
if (strs[i] == NULL)
return (-1);
value_sz += strlen(strs[i]) + 1;
newsize = strnlen(strs[i], max_size);
if (newsize == max_size)
return (-1); /* not terminated */
value_sz += newsize + 1; /* +1 for NULL */
max_size -= newsize + 1;
}
}
break;
@@ -1163,7 +1187,7 @@ nvlist_add_common(nvlist_t *nvl, const char *name,
* In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
* is the size of the string(s) included.
*/
if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
if ((value_sz = i_get_value_size(type, data, nelem, 0)) < 0)
return (EINVAL);
if (i_validate_nvpair_value(type, nelem, data) != 0)
@@ -1588,7 +1612,7 @@ nvpair_value_common(const nvpair_t *nvp, data_type_t type, uint_t *nelem,
#endif
if (data == NULL)
return (EINVAL);
if ((value_sz = i_get_value_size(type, NULL, 1)) < 0)
if ((value_sz = i_get_value_size(type, NULL, 1, 0)) < 0)
return (EINVAL);
memcpy(data, NVP_VALUE(nvp), (size_t)value_sz);
if (nelem != NULL)
@@ -3019,7 +3043,8 @@ nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
* In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
* is the size of the string(s) excluded.
*/
if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp),
NVP_SIZE(nvp))) < 0)
return (EFAULT);
if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
@@ -3333,7 +3358,7 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
* In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
* is the size of the string(s) excluded.
*/
if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
if ((value_sz = i_get_value_size(type, NULL, nelem,
    NVP_SIZE(nvp))) < 0)
return (EFAULT);
/* if there is no data to extract then return */
@@ -72,9 +72,6 @@ SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, free_target,
param_set_arc_free_target, 0, CTLFLAG_RW,
"Desired number of free pages below which ARC triggers reclaim");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift,
param_set_arc_no_grow_shift, 0, ZMOD_RW,
"log2(fraction of ARC which must be free to allow growing)");
int64_t
arc_available_memory(void)
@@ -292,7 +292,7 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
{
int err, val;
val = arc_no_grow_shift;
val = zfs_arc_no_grow_shift;
err = sysctl_handle_int(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
@@ -300,7 +300,7 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
if (val < 0 || val >= arc_shrink_shift)
return (EINVAL);
arc_no_grow_shift = val;
zfs_arc_no_grow_shift = val;
if (arg2 != 0)
warn_deprecated_sysctl("arc_no_grow_shift", "arc.no_grow_shift");
@@ -541,14 +541,14 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log,
/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
* The zfs_metaslab_condense_pct determines how much more compact the in-core
* space map representation must be before we compact it on-disk.
* Values should be greater than or equal to 100.
*/
extern uint_t zfs_condense_pct;
extern uint_t zfs_metaslab_condense_pct;
SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct,
CTLFLAG_RWTUN, &zfs_condense_pct, 0,
SYSCTL_UINT(_vfs_zfs, OID_AUTO, metaslab_condense_pct,
CTLFLAG_RWTUN, &zfs_metaslab_condense_pct, 0,
"Condense on-disk spacemap when it is more than this many percents"
" of in-memory counterpart");
@@ -617,18 +617,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval,
"Configuration cache file write, retry after failure, interval"
" (seconds)");
extern uint64_t zfs_max_missing_tvds_cachefile;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile,
CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0,
"Allow importing pools with missing top-level vdevs in cache file");
extern uint64_t zfs_max_missing_tvds_scan;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan,
CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0,
"Allow importing pools with missing top-level vdevs during scan");
/* spa_misc.c */
extern int zfs_flags;
@@ -188,6 +188,12 @@ spl_kvmalloc(size_t size, gfp_t lflags)
return (ptr);
}
/*
* vmalloc fallback. KM_VMEM may not have been requested originally if
* we've come through spl_kmem_alloc_impl(), so we need to remove
* __GFP_COMP, which is not a valid flag for vmalloc.
*/
lflags &= ~__GFP_COMP;
return (spl_vmalloc(size, lflags));
}
@@ -410,6 +410,24 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
return (0);
}
/*
 * Module parameter setter for zfs_arc_no_grow_shift.
 *
 * Parses 'buf' with kstrtoul() (base 0, so the base is taken from the
 * string's prefix) and stores the result in zfs_arc_no_grow_shift.
 * A kstrtoul() failure is passed back unchanged. A value that is not
 * strictly less than arc_shrink_shift is rejected with -EINVAL and the
 * current setting is left untouched.
 */
int
param_set_arc_no_grow_shift(const char *buf, zfs_kernel_param_t *kp)
{
	unsigned long shift;
	int rc = kstrtoul(buf, 0, &shift);

	if (rc)
		return (SET_ERROR(rc));

	/* The new shift must stay below arc_shrink_shift. */
	if (shift >= arc_shrink_shift)
		return (-SET_ERROR(EINVAL));

	zfs_arc_no_grow_shift = shift;
	return (0);
}
int
param_set_l2arc_dwpd_limit(const char *buf, zfs_kernel_param_t *kp)
{
@@ -931,8 +931,14 @@ vdev_disk_io_rw(zio_t *zio)
return (SET_ERROR(EIO));
}
vdev_t *iter = v;
while (iter != NULL && iter->vdev_failfast == ZPROP_BOOLEAN_INHERIT)
iter = iter->vdev_parent;
boolean_t failfast = iter ? iter->vdev_failfast == 1 :
vdev_prop_default_numeric(VDEV_PROP_FAILFAST);
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
v->vdev_failfast == B_TRUE) {
failfast) {
bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
}
@@ -1689,6 +1689,24 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs)
return (0);
}
/*
 * Starting at zp, walk z_all_znodes and return the first znode whose inode
 * could be referenced with igrab(), or NULL once the list is exhausted.
 * The caller must hold z_znodes_lock for the walk; the inode reference
 * taken here keeps the returned znode alive after that lock is dropped
 * for zfs_rezget().
 */
static znode_t *
zfs_resume_hold_next_znode(zfsvfs_t *zfsvfs, znode_t *zp)
{
	znode_t *candidate = zp;

	ASSERT(MUTEX_HELD(&zfsvfs->z_znodes_lock));

	while (candidate != NULL) {
		/* Stop at the first znode we managed to pin. */
		if (igrab(ZTOI(candidate)) != NULL)
			break;
		candidate = list_next(&zfsvfs->z_all_znodes, candidate);
	}

	return (candidate);
}
/*
* Rebuild SA and release VOPs. Note that ownership of the underlying dataset
* is an invariant across any of the operations that can be performed while the
@@ -1732,13 +1750,23 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
* dbufs. If a zfs_rezget() fails, then we unhash the inode
* and mark it stale. This prevents a collision if a new
* inode/object is created which must use the same inode
* number. The stale inode will be be released when the
* VFS prunes the dentry holding the remaining references
* on the stale inode.
* number. The stale inode will be released when the VFS
* prunes the dentry holding the remaining references on
* the stale inode.
*
* zfs_rezget() takes the per-object znode hold lock. Pin each znode
* while holding z_znodes_lock, then drop the list lock before calling
* zfs_rezget() to preserve the normal zh_lock -> z_znodes_lock order.
*/
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp;
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
zp = zfs_resume_hold_next_znode(zfsvfs,
list_head(&zfsvfs->z_all_znodes));
while (zp != NULL) {
znode_t *next = zfs_resume_hold_next_znode(zfsvfs,
list_next(&zfsvfs->z_all_znodes, zp));
mutex_exit(&zfsvfs->z_znodes_lock);
err2 = zfs_rezget(zp);
if (err2) {
zpl_d_drop_aliases(ZTOI(zp));
@@ -1747,9 +1775,14 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
/* see comment in zfs_suspend_fs() */
if (zp->z_suspended) {
zfs_zrele_async(zp);
zp->z_suspended = B_FALSE;
zfs_zrele_async(zp);
}
zfs_zrele_async(zp);
mutex_enter(&zfsvfs->z_znodes_lock);
zp = next;
}
mutex_exit(&zfsvfs->z_znodes_lock);
@@ -2434,9 +2434,13 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
&zp->z_pflags, sizeof (zp->z_pflags));
if (attrzp) {
/*
* attrzp is zp's hidden xattr directory, so the second
* znode lock acquisition is nested rather than recursive.
*/
if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
mutex_enter(&attrzp->z_acl_lock);
mutex_enter(&attrzp->z_lock);
mutex_enter_nested(&attrzp->z_acl_lock, NESTED_SINGLE);
mutex_enter_nested(&attrzp->z_lock, NESTED_SINGLE);
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
sizeof (attrzp->z_pflags));
@@ -4074,18 +4078,32 @@ zfs_inactive(struct inode *ip)
{
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
krwlock_t *zti_lock = &zfsvfs->z_teardown_inactive_lock;
uint64_t atime[2];
int error;
int need_unlock = 0;
boolean_t no_lockdep = B_FALSE;
/* Only read lock if we haven't already write locked, e.g. rollback */
if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
if (!RW_WRITE_HELD(zti_lock)) {
need_unlock = 1;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
/*
* kswapd reaches evict_inode() with fs_reclaim held. Suppress
* lockdep only for this reclaim-thread acquire/release pair.
*/
no_lockdep = current_is_reclaim_thread();
if (no_lockdep)
rw_enter_nolockdep(zti_lock, RW_READER);
else
rw_enter(zti_lock, RW_READER);
}
if (zp->z_sa_hdl == NULL) {
if (need_unlock)
rw_exit(&zfsvfs->z_teardown_inactive_lock);
if (need_unlock) {
if (no_lockdep)
rw_exit_nolockdep(zti_lock);
else
rw_exit(zti_lock);
}
return;
}
@@ -4111,8 +4129,12 @@ zfs_inactive(struct inode *ip)
}
zfs_zinactive(zp);
if (need_unlock)
rw_exit(&zfsvfs->z_teardown_inactive_lock);
if (need_unlock) {
if (no_lockdep)
rw_exit_nolockdep(zti_lock);
else
rw_exit(zti_lock);
}
}
/*
@@ -550,10 +550,11 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
*
* Finally, all filesystems get automatic handling for the 'source' option,
* that is, the "name" of the filesystem (the first column of df(1)'s output).
* However, this only happens if the handler does not otherwise handle
* the 'source' option. Since we handle _all_ options because of 'sloppy', we
* deal with this explicitly by calling into the kernel's helper for this,
* vfs_parse_fs_param_source(), which sets up fc->source.
* However, this only happens if the handler does not otherwise handle the
* 'source' option. Since we handle _all_ options because of 'sloppy', we have
* to handle it ourselves. Normally we would call vfs_parse_fs_param_source()
* to deal with this, but that didn't appear until 5.14, and it's small enough
* that we can just handle it ourselves.
*
* source
*
@@ -565,6 +566,7 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
*/
enum {
Opt_source,
Opt_exec, Opt_suid, Opt_dev,
Opt_atime, Opt_relatime, Opt_strictatime,
Opt_saxattr, Opt_dirxattr, Opt_noxattr,
@@ -574,6 +576,8 @@ enum {
};
static const struct fs_parameter_spec zpl_param_spec[] = {
fsparam_string("source", Opt_source),
fsparam_flag_no("exec", Opt_exec),
fsparam_flag_no("suid", Opt_suid),
fsparam_flag_no("dev", Opt_dev),
@@ -609,18 +613,34 @@ static const struct fs_parameter_spec zpl_param_spec[] = {
{}
};
/*
 * Compatibility wrapper around fs_parse().
 *
 * From 5.6 onwards fs_parse() accepts the fs_parameter_spec table
 * directly. Earlier kernels instead expect a struct
 * fs_parameter_description that wraps the specs together with a name,
 * so build one around zpl_param_spec for them.
 */
static int
zpl_fs_parse(struct fs_context *fc, struct fs_parameter *param,
    struct fs_parse_result *result)
{
#ifndef HAVE_FS_PARSE_TAKES_SPEC
	static const struct fs_parameter_description zpl_param_desc = {
		.name = "zfs",
		.specs = zpl_param_spec,
	};

	return (fs_parse(fc, &zpl_param_desc, param, result));
#else
	return (fs_parse(fc, zpl_param_spec, param, result));
#endif
}
static int
zpl_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
vfs_t *vfs = fc->fs_private;
/* Handle 'source' explicitly so we don't trip on it as an unknown. */
int opt = vfs_parse_fs_param_source(fc, param);
if (opt != -ENOPARAM)
return (opt);
struct fs_parse_result result;
opt = fs_parse(fc, zpl_param_spec, param, &result);
int opt = zpl_fs_parse(fc, param, &result);
if (opt == -ENOPARAM) {
/*
* Convert unknowns to warnings, to work around the whole
@@ -632,6 +652,16 @@ zpl_parse_param(struct fs_context *fc, struct fs_parameter *param)
return (opt);
switch (opt) {
case Opt_source:
if (fc->source != NULL) {
cmn_err(CE_NOTE,
"ZFS: multiple 'source' options not supported");
return (-SET_ERROR(EINVAL));
}
fc->source = param->string;
param->string = NULL;
break;
case Opt_exec:
vfs->vfs_exec = !result.negated;
vfs->vfs_do_exec = B_TRUE;
@@ -794,7 +824,7 @@ zpl_parse_monolithic(struct fs_context *fc, void *data)
/* Check if this is one of our options. */
struct fs_parse_result result;
int opt = fs_parse(fc, zpl_param_spec, &param, &result);
int opt = zpl_fs_parse(fc, &param, &result);
if (opt >= 0) {
/*
* We already know this is one of our options, so a
@@ -874,9 +904,14 @@ zpl_get_tree(struct fs_context *fc)
if (sb->s_root == NULL) {
vfs_t *vfs = fc->fs_private;
/* Apply readonly flag as mount option */
if (fc->sb_flags & SB_RDONLY) {
vfs->vfs_readonly = B_TRUE;
/*
* If SB_RDONLY was set/cleared from mount options, update
* them in the options struct so we set up the filesystem
* in the proper state.
*/
if (fc->sb_flags_mask & SB_RDONLY) {
vfs->vfs_readonly =
(fc->sb_flags & SB_RDONLY) ? B_TRUE : B_FALSE;
vfs->vfs_do_readonly = B_TRUE;
}
@@ -701,6 +701,24 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value,
* ZFS allows extended user attributes to be disabled administratively
* by setting the 'xattr=off' property on the dataset.
*/
/*
 * Build "<prefix><name>" as a NUL-terminated string in the caller's
 * buffer. 'prefix_len' and 'name_len' are the byte lengths of the two
 * parts, excluding any terminator; 'buflen' is the capacity of 'buf'.
 *
 * Linux fs/xattr.c (import_xattr_name) limits the full xattr name to
 * XATTR_NAME_MAX before any handler is invoked, so a buffer of
 * XATTR_NAME_MAX + 1 bytes is always large enough. The assertion only
 * documents that expectation.
 */
static inline void
zpl_xattr_join_name(char *buf, size_t buflen, const char *prefix,
    size_t prefix_len, const char *name, size_t name_len)
{
	char *tail = buf + prefix_len;

	ASSERT3U(prefix_len + name_len + 1, <=, buflen);

	memcpy(buf, prefix, prefix_len);
	memcpy(tail, name, name_len);
	tail[name_len] = '\0';
}
static int
__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size,
const char *name, size_t name_len)
@@ -726,9 +744,13 @@ __zpl_xattr_user_get(struct inode *ip, const char *name,
* try again without the namespace prefix for compatibility with
* other platforms.
*/
char *xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
char xattr_name[XATTR_NAME_MAX + 1];
zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN,
name, strlen(name));
error = zpl_xattr_get(ip, xattr_name, value, size);
kmem_strfree(xattr_name);
if (error == -ENODATA)
error = zpl_xattr_get(ip, name, value, size);
@@ -758,8 +780,13 @@ __zpl_xattr_user_set(zidmap_t *user_ns,
* XATTR_CREATE: fail if xattr already exists
* XATTR_REPLACE: fail if xattr does not exist
*/
char *prefixed_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
char prefixed_name[XATTR_NAME_MAX + 1];
const char *clear_name, *set_name;
zpl_xattr_join_name(prefixed_name, sizeof (prefixed_name),
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN,
name, strlen(name));
if (zfs_xattr_compat) {
clear_name = prefixed_name;
set_name = name;
@@ -776,7 +803,7 @@ __zpl_xattr_user_set(zidmap_t *user_ns,
* because it already exists. Stop here.
*/
if (error == -EEXIST)
goto out;
return (error);
/*
* If XATTR_REPLACE was specified and we succeeded to clear
* an xattr, we don't need to replace anything when setting
@@ -788,10 +815,7 @@ __zpl_xattr_user_set(zidmap_t *user_ns,
/*
* Set the new value with the configured name format.
*/
error = zpl_xattr_set(ip, set_name, value, size, flags);
out:
kmem_strfree(prefixed_name);
return (error);
return (zpl_xattr_set(ip, set_name, value, size, flags));
}
ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set);
@@ -824,17 +848,16 @@ static int
__zpl_xattr_trusted_get(struct inode *ip, const char *name,
void *value, size_t size)
{
char *xattr_name;
int error;
char xattr_name[XATTR_NAME_MAX + 1];
if (!capable(CAP_SYS_ADMIN))
return (-EACCES);
/* xattr_resolve_name will do this for us if this is defined */
xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
error = zpl_xattr_get(ip, xattr_name, value, size);
kmem_strfree(xattr_name);
return (error);
zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN,
name, strlen(name));
return (zpl_xattr_get(ip, xattr_name, value, size));
}
ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get);
@@ -844,17 +867,16 @@ __zpl_xattr_trusted_set(zidmap_t *user_ns,
const void *value, size_t size, int flags)
{
(void) user_ns;
char *xattr_name;
int error;
char xattr_name[XATTR_NAME_MAX + 1];
if (!capable(CAP_SYS_ADMIN))
return (-EACCES);
/* xattr_resolve_name will do this for us if this is defined */
xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
error = zpl_xattr_set(ip, xattr_name, value, size, flags);
kmem_strfree(xattr_name);
return (error);
zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN,
name, strlen(name));
return (zpl_xattr_set(ip, xattr_name, value, size, flags));
}
ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set);
@@ -889,14 +911,13 @@ static int
__zpl_xattr_security_get(struct inode *ip, const char *name,
void *value, size_t size)
{
char *xattr_name;
int error;
/* xattr_resolve_name will do this for us if this is defined */
xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
error = zpl_xattr_get(ip, xattr_name, value, size);
kmem_strfree(xattr_name);
char xattr_name[XATTR_NAME_MAX + 1];
return (error);
zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN,
name, strlen(name));
return (zpl_xattr_get(ip, xattr_name, value, size));
}
ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get);
@@ -906,14 +927,13 @@ __zpl_xattr_security_set(zidmap_t *user_ns,
const void *value, size_t size, int flags)
{
(void) user_ns;
char *xattr_name;
int error;
/* xattr_resolve_name will do this for us if this is defined */
xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
error = zpl_xattr_set(ip, xattr_name, value, size, flags);
kmem_strfree(xattr_name);
char xattr_name[XATTR_NAME_MAX + 1];
return (error);
zpl_xattr_join_name(xattr_name, sizeof (xattr_name),
XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN,
name, strlen(name));
return (zpl_xattr_set(ip, xattr_name, value, size, flags));
}
ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set);
@@ -238,11 +238,11 @@ _VALSTR_BITFIELD_IMPL(zio_stage,
{ 'E', "EN", "ENCRYPT" },
{ 'C', "CG", "CHECKSUM_GENERATE" },
{ 'N', "NW", "NOP_WRITE" },
{ 'B', "BF", "BRT_FREE" },
{ 'd', "dS", "DDT_READ_START" },
{ 'd', "dD", "DDT_READ_DONE" },
{ 'd', "dW", "DDT_WRITE" },
{ 'd', "dF", "DDT_FREE" },
{ 'B', "BF", "BRT_FREE" },
{ 'G', "GA", "GANG_ASSEMBLE" },
{ 'G', "GI", "GANG_ISSUE" },
{ 'D', "DT", "DVA_THROTTLE" },
@@ -374,10 +374,16 @@ vdev_prop_init(void)
{ "on", 1},
{ NULL }
};
static const zprop_index_t boolean_inherit_table[] = {
{ "off", 0},
{ "on", 1},
{ "inherit", ZPROP_BOOLEAN_INHERIT},
{ NULL }
};
static const zprop_index_t boolean_na_table[] = {
{ "off", 0},
{ "on", 1},
{ "-", 2}, /* ZPROP_BOOLEAN_NA */
{ "-", ZPROP_BOOLEAN_NA},
{ NULL }
};
@@ -388,6 +394,14 @@ vdev_prop_init(void)
{ NULL }
};
static const zprop_index_t vdev_alloc_bias_table[] = {
{ "none", VDEV_BIAS_NONE },
{ "log", VDEV_BIAS_LOG },
{ "special", VDEV_BIAS_SPECIAL },
{ "dedup", VDEV_BIAS_DEDUP },
{ NULL }
};
struct zfs_mod_supported_features *sfeatures =
zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);
@@ -547,8 +561,8 @@ vdev_prop_init(void)
/* default index properties */
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,
PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "FAILFAST", boolean_table,
sfeatures);
PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off | inherit", "FAILFAST",
boolean_inherit_table, sfeatures);
zprop_register_index(VDEV_PROP_SLOW_IO_EVENTS, "slow_io_events",
B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off",
"SLOW_IO_EVENTS", boolean_table, sfeatures);
@@ -556,6 +570,13 @@ vdev_prop_init(void)
VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV,
"auto | on | off", "IO_SCHEDULER",
vdevschedulertype_table, sfeatures);
zprop_register_index(VDEV_PROP_ALLOC_BIAS, "alloc_bias",
VDEV_BIAS_NONE, PROP_DEFAULT, ZFS_TYPE_VDEV,
"none | log | special | dedup", "ALLOC_BIAS",
vdev_alloc_bias_table, sfeatures);
zprop_register_index(VDEV_PROP_ROTATIONAL, "rotational", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "ROTATIONAL",
boolean_table, sfeatures);
/* hidden properties */
zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING,
+12 -18
View File
@@ -398,14 +398,14 @@ uint_t zfs_arc_pc_percent = 0;
/*
* log2(fraction of ARC which must be free to allow growing).
* I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
* I.e. If there is less than arc_c >> zfs_arc_no_grow_shift free memory,
* when reading a new block into the ARC, we will evict an equal-sized block
* from the ARC.
*
* This must be less than arc_shrink_shift, so that when we shrink the ARC,
* we will still not allow it to grow.
*/
uint_t arc_no_grow_shift = 5;
uint_t zfs_arc_no_grow_shift = 5;
/*
@@ -586,6 +586,7 @@ arc_stats_t arc_stats = {
{ "uncached_metadata", KSTAT_DATA_UINT64 },
{ "uncached_evictable_data", KSTAT_DATA_UINT64 },
{ "uncached_evictable_metadata", KSTAT_DATA_UINT64 },
{ "l2_ndev", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_prefetch_asize", KSTAT_DATA_UINT64 },
@@ -4975,7 +4976,7 @@ arc_reap_cb_check(void *arg, zthr_t *zthr)
*/
arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
return (B_TRUE);
} else if (free_memory < arc_c >> arc_no_grow_shift) {
} else if (free_memory < arc_c >> zfs_arc_no_grow_shift) {
arc_no_grow = B_TRUE;
} else if (gethrtime() >= arc_growtime) {
arc_no_grow = B_FALSE;
@@ -5571,20 +5572,6 @@ arc_buf_access(arc_buf_t *buf)
!HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
}
/* a generic arc_read_done_func_t which you can use */
void
arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *arg)
{
(void) zio, (void) zb, (void) bp;
if (buf == NULL)
return;
memcpy(arg, buf->b_data, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
/* a generic arc_read_done_func_t */
void
arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
@@ -7440,6 +7427,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
aggsum_value(&arc_sums.arcstat_dnode_size);
as->arcstat_bonus_size.value.ui64 =
wmsum_value(&arc_sums.arcstat_bonus_size);
as->arcstat_l2_ndev.value.ui64 = l2arc_ndev;
as->arcstat_l2_hits.value.ui64 =
wmsum_value(&arc_sums.arcstat_l2_hits);
as->arcstat_l2_misses.value.ui64 =
@@ -7654,7 +7642,8 @@ arc_tuning_update(boolean_t verbose)
/* Valid range: 1 - N */
if (zfs_arc_shrink_shift) {
arc_shrink_shift = zfs_arc_shrink_shift;
arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
zfs_arc_no_grow_shift = MIN(zfs_arc_no_grow_shift,
arc_shrink_shift - 1);
}
/* Valid range: 1 - N ms */
@@ -11683,6 +11672,7 @@ EXPORT_SYMBOL(arc_write);
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_buf_destroy);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);
@@ -11701,6 +11691,10 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift,
param_set_arc_no_grow_shift, param_get_uint, ZMOD_RW,
"log2(fraction of ARC which must be free to allow growing)");
#ifdef _KERNEL
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
"Percent of pagecache to reclaim ARC to");
+2 -2
View File
@@ -221,7 +221,7 @@ ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
B_FALSE, dlu, &dlu->dlu_ndbp, &dlu->dlu_dbp,
DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO));
dlu->dlu_tx = tx;
@@ -338,7 +338,7 @@ ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
*/
dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, dlu);
ddt->ddt_log_active->ddl_length +=
dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
+1 -1
View File
@@ -1859,7 +1859,7 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
&cookie)) != NULL) {
/*
* os_userused_lock protects against concurrent calls to
* zap_increment_int(). It's needed because zap_increment_int()
* zap_increment(). It's needed because zap_increment()
* is not thread-safe (i.e. not atomic).
*/
mutex_enter(&os->os_userused_lock);
+31 -13
View File
@@ -2901,16 +2901,20 @@ receive_read_record(dmu_recv_cookie_t *drc)
{
struct drr_object *drro =
&drc->drc_rrd->header.drr_u.drr_object;
uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
uint32_t size;
void *buf = NULL;
dmu_object_info_t doi;
size = DRR_OBJECT_PAYLOAD_SIZE(drro);
if (size > SPA_MAXBLOCKSIZE)
return (SET_ERROR(ERANGE));
if (size != 0)
buf = kmem_zalloc(size, KM_SLEEP);
buf = vmem_zalloc(size, KM_SLEEP);
err = receive_read_payload_and_next_header(drc, size, buf);
if (err != 0) {
kmem_free(buf, size);
vmem_free(buf, size);
return (err);
}
err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
@@ -2934,7 +2938,11 @@ receive_read_record(dmu_recv_cookie_t *drc)
case DRR_WRITE:
{
struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
uint64_t size = DRR_WRITE_PAYLOAD_SIZE(drrw);
if (size > SPA_MAXBLOCKSIZE)
return (SET_ERROR(ERANGE));
abd_t *abd = abd_alloc_linear(size, B_FALSE);
err = receive_read_payload_and_next_header(drc, size,
abd_to_buf(abd));
@@ -2951,12 +2959,18 @@ receive_read_record(dmu_recv_cookie_t *drc)
{
struct drr_write_embedded *drrwe =
&drc->drc_rrd->header.drr_u.drr_write_embedded;
uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
void *buf = kmem_zalloc(size, KM_SLEEP);
uint32_t size;
void *buf;
size = P2ROUNDUP(drrwe->drr_psize, 8);
if (size > SPA_MAXBLOCKSIZE)
return (SET_ERROR(ERANGE));
buf = vmem_zalloc(size, KM_SLEEP);
err = receive_read_payload_and_next_header(drc, size, buf);
if (err != 0) {
kmem_free(buf, size);
vmem_free(buf, size);
return (err);
}
@@ -2985,7 +2999,11 @@ receive_read_record(dmu_recv_cookie_t *drc)
case DRR_SPILL:
{
struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
uint64_t size = DRR_SPILL_PAYLOAD_SIZE(drrs);
if (size > SPA_MAXBLOCKSIZE)
return (SET_ERROR(ERANGE));
abd_t *abd = abd_alloc_linear(size, B_FALSE);
err = receive_read_payload_and_next_header(drc, size,
abd_to_buf(abd));
@@ -3136,7 +3154,7 @@ receive_process_record(struct receive_writer_arg *rwa,
abd_free(rrd->abd);
rrd->abd = NULL;
} else if (rrd->payload != NULL) {
kmem_free(rrd->payload, rrd->payload_size);
vmem_free(rrd->payload, rrd->payload_size);
rrd->payload = NULL;
}
return (0);
@@ -3150,7 +3168,7 @@ receive_process_record(struct receive_writer_arg *rwa,
rrd->abd = NULL;
rrd->payload = NULL;
} else if (rrd->payload != NULL) {
kmem_free(rrd->payload, rrd->payload_size);
vmem_free(rrd->payload, rrd->payload_size);
rrd->payload = NULL;
}
@@ -3163,7 +3181,7 @@ receive_process_record(struct receive_writer_arg *rwa,
{
struct drr_object *drro = &rrd->header.drr_u.drr_object;
err = receive_object(rwa, drro, rrd->payload);
kmem_free(rrd->payload, rrd->payload_size);
vmem_free(rrd->payload, rrd->payload_size);
rrd->payload = NULL;
break;
}
@@ -3201,7 +3219,7 @@ receive_process_record(struct receive_writer_arg *rwa,
struct drr_write_embedded *drrwe =
&rrd->header.drr_u.drr_write_embedded;
err = receive_write_embedded(rwa, drrwe, rrd->payload);
kmem_free(rrd->payload, rrd->payload_size);
vmem_free(rrd->payload, rrd->payload_size);
rrd->payload = NULL;
break;
}
@@ -3270,7 +3288,7 @@ receive_writer_thread(void *arg)
rrd->abd = NULL;
rrd->payload = NULL;
} else if (rrd->payload != NULL) {
kmem_free(rrd->payload, rrd->payload_size);
vmem_free(rrd->payload, rrd->payload_size);
rrd->payload = NULL;
}
/*
+35 -2
View File
@@ -2241,6 +2241,37 @@ setup_send_progress(struct dmu_send_params *dspp)
return (dssp);
}
/*
 * Pad a packed nvlist out to a multiple of 8 bytes.
 *
 * Payloads must be multiples of 8 bytes for historical compatibility, but
 * XDR-encoded nvlists are sized in multiples of 4 bytes, so the packed
 * buffer may be 4 bytes short of the required alignment.
 *
 * When padding is needed the data is copied into a new, larger buffer whose
 * tail is zero-filled; the original buffer is released and *buffer / *size
 * are updated to describe the replacement. When the size is already a
 * multiple of 8, *buffer and *size are left untouched.
 *
 * The copy is not ideal for performance or memory use, but most BEGIN
 * nvlists are small or absent, the allocation is momentary, and this is
 * done at most once per dataset.
 *
 * Trailing bytes after a packed nvlist are harmless on the receiving side
 * because packed nvlists carry an internal end-of-list marker.
 *
 * The replacement buffer comes from kmem_alloc() and, like the original,
 * can be freed with fnvlist_pack_free().
 */
static inline void
pad_packed_nvlist(char **buffer, size_t *size)
{
	const size_t packed_len = *size;
	const size_t padded_len = P2ROUNDUP(packed_len, 8);

	/* Already 8-byte aligned: nothing to do. */
	if (padded_len == packed_len)
		return;

	char *padded = kmem_alloc(padded_len, KM_SLEEP);

	memcpy(padded, *buffer, packed_len);
	/* Zero the pad bytes so no uninitialized memory is sent. */
	memset(padded + packed_len, 0, padded_len - packed_len);

	fnvlist_pack_free(*buffer, packed_len);
	*buffer = padded;
	*size = padded_len;
}
/*
* Actually do the bulk of the work in a zfs send.
*
@@ -2474,7 +2505,7 @@ dmu_send_impl(struct dmu_send_params *dspp)
dsl_pool_rele(dp, tag);
void *payload = NULL;
char *payload = NULL;
size_t payload_len = 0;
nvlist_t *nvl = fnvlist_alloc();
@@ -2548,7 +2579,9 @@ dmu_send_impl(struct dmu_send_params *dspp)
}
if (!nvlist_empty(nvl)) {
payload = fnvlist_pack(nvl, &payload_len);
VERIFY0(nvlist_pack(nvl, &payload, &payload_len,
NV_ENCODE_XDR, KM_SLEEP));
pad_packed_nvlist(&payload, &payload_len);
drr->drr_payloadlen = payload_len;
}
@@ -490,7 +490,7 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
} else {
dmu_buf_t *db;
VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
DB_RF_MUST_SUCCEED, FTAG, &db));
DB_RF_MUST_SUCCEED, tag, &db));
dmu_buf_will_fill(db, tx, B_FALSE);
VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen,
SPA_MINBLOCKSIZE), tx));
+68 -25
View File
@@ -1534,9 +1534,28 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
}
/* call from syncing context when we actually write/free space for this dd */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
static void dsl_dir_diduse_transfer_space_impl(dsl_dir_t *dd, int64_t used,
int64_t compressed, int64_t uncompressed, int64_t tonew,
dd_used_t oldtype, dd_used_t newtype, boolean_t nested, dmu_tx_t *tx);
/*
 * Acquire dd->dd_lock. When 'nested' is set the lock is taken with
 * mutex_enter_nested() and the NESTED_SINGLE subclass; otherwise a plain
 * mutex_enter() is used.
 */
static void
dsl_dir_lock_enter(dsl_dir_t *dd, boolean_t nested)
{
	if (!nested) {
		mutex_enter(&dd->dd_lock);
		return;
	}

	/*
	 * lockdep needs an explicit subclass when a child dd_lock
	 * nests an ancestor.
	 */
	mutex_enter_nested(&dd->dd_lock, NESTED_SINGLE);
}
static void
dsl_dir_diduse_space_impl(dsl_dir_t *dd, dd_used_t type,
int64_t used, int64_t compressed, int64_t uncompressed,
boolean_t nested, dmu_tx_t *tx)
{
int64_t accounted_delta;
@@ -1554,7 +1573,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
*/
boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
if (needlock)
mutex_enter(&dd->dd_lock);
dsl_dir_lock_enter(dd, nested);
dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
@@ -1582,12 +1601,20 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
mutex_exit(&dd->dd_lock);
if (dd->dd_parent != NULL) {
dsl_dir_diduse_transfer_space(dd->dd_parent,
dsl_dir_diduse_transfer_space_impl(dd->dd_parent,
accounted_delta, compressed, uncompressed,
used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
used, DD_USED_CHILD_RSRV, DD_USED_CHILD, nested, tx);
}
}
/*
 * Public entry point for space accounting on a dsl_dir.
 *
 * Forwards all arguments unchanged to dsl_dir_diduse_space_impl() with
 * nested = B_FALSE, so callers of this wrapper never request the nested
 * dd_lock acquisition.
 */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used,
int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
dsl_dir_diduse_space_impl(dd, type, used, compressed, uncompressed,
B_FALSE, tx);
}
void
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
@@ -1612,10 +1639,10 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
mutex_exit(&dd->dd_lock);
}
void
dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
static void
dsl_dir_diduse_transfer_space_impl(dsl_dir_t *dd, int64_t used,
int64_t compressed, int64_t uncompressed, int64_t tonew,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
dd_used_t oldtype, dd_used_t newtype, boolean_t nested, dmu_tx_t *tx)
{
int64_t accounted_delta;
@@ -1625,7 +1652,7 @@ dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
dsl_dir_lock_enter(dd, nested);
dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
@@ -1656,12 +1683,21 @@ dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
mutex_exit(&dd->dd_lock);
if (dd->dd_parent != NULL) {
dsl_dir_diduse_transfer_space(dd->dd_parent,
dsl_dir_diduse_transfer_space_impl(dd->dd_parent,
accounted_delta, compressed, uncompressed,
used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
used, DD_USED_CHILD_RSRV, DD_USED_CHILD, nested, tx);
}
}
/*
 * Public entry point combining a "did use" update with a transfer between
 * two dd_used_t categories.
 *
 * Forwards all arguments unchanged to dsl_dir_diduse_transfer_space_impl()
 * with nested = B_FALSE, so callers of this wrapper never request the
 * nested dd_lock acquisition.
 */
void
dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
int64_t compressed, int64_t uncompressed, int64_t tonew,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
dsl_dir_diduse_transfer_space_impl(dd, used, compressed,
uncompressed, tonew, oldtype, newtype, B_FALSE, tx);
}
typedef struct dsl_dir_set_qr_arg {
const char *ddsqra_name;
zprop_source_t ddsqra_source;
@@ -1828,8 +1864,8 @@ dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
if (dd->dd_parent != NULL) {
/* Roll up this additional usage into our ancestors */
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
delta, 0, 0, tx);
dsl_dir_diduse_space_impl(dd->dd_parent, DD_USED_CHILD_RSRV,
delta, 0, 0, B_TRUE, tx);
}
mutex_exit(&dd->dd_lock);
}
@@ -2268,22 +2304,29 @@ dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx)
{
dsl_pool_t *dp = dmu_tx_pool(tx);
inode_timespec_t t;
ASSERT(dsl_pool_sync_context(dp));
gethrestime(&t);
mutex_enter(&dd->dd_lock);
dd->dd_snap_cmtime = t;
if (spa_feature_is_enabled(dp->dp_spa,
SPA_FEATURE_EXTENSIBLE_DATASET)) {
objset_t *mos = dd->dd_pool->dp_meta_objset;
uint64_t ddobj = dd->dd_object;
dsl_dir_zapify(dd, tx);
VERIFY0(zap_update(mos, ddobj,
DD_FIELD_SNAPSHOTS_CHANGED,
sizeof (uint64_t),
sizeof (inode_timespec_t) / sizeof (uint64_t),
&t, tx));
}
mutex_exit(&dd->dd_lock);
if (!spa_feature_is_enabled(dp->dp_spa,
SPA_FEATURE_EXTENSIBLE_DATASET)) {
return;
}
objset_t *mos = dd->dd_pool->dp_meta_objset;
/*
* dsl_dir_zapify() and zap_update() may dirty buffers and recurse
* into space accounting, so do not call them with dd_lock held.
*/
dsl_dir_zapify(dd, tx);
VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_SNAPSHOTS_CHANGED,
sizeof (uint64_t),
sizeof (inode_timespec_t) / sizeof (uint64_t), &t, tx));
}
void
@@ -1280,6 +1280,7 @@ dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
scn->errorscrub_phys.dep_paused_flags = B_TRUE;
dsl_errorscrub_sync_state(scn, tx);
zap_cursor_fini(&scn->errorscrub_cursor);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
} else {
ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
+5 -1
View File
@@ -96,13 +96,17 @@ zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len,
/* check if hardware accelerator can be used */
if (qat_dc_use_accel(d_len)) {
if (qat_compress(QAT_DECOMPRESS, s_start, s_len,
d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS)
d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS) {
if ((size_t)dstlen == d_len)
return (0);
}
/* If hardware decompression fails, fall back to software. */
}
if (uncompress_func(d_start, &dstlen, s_start, s_len) != Z_OK)
return (-1);
if ((size_t)dstlen != d_len)
return (-1);
return (0);
}
+11 -4
View File
@@ -89,17 +89,24 @@ zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len,
(void) n;
const char *src = s_start;
uint32_t bufsiz = BE_IN32(src);
int decoded;
/* invalid compressed buffer size encoded at start */
if (bufsiz + sizeof (bufsiz) > s_len)
return (1);
/*
* Returns 0 on success (decompression function returned non-negative)
* and non-zero on failure (decompression function returned negative).
* LZ4_uncompress_unknownOutputSize returns the number of bytes decoded
* on success, or a negative value on failure. An OpenZFS block must
* expand to exactly d_len bytes.
*/
return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
d_start, bufsiz, d_len) < 0);
decoded = LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
d_start, bufsiz, d_len);
if (decoded < 0)
return (1);
if (d_len != (size_t)decoded)
return (1);
return (0);
}
ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress)
+26 -5
View File
@@ -82,11 +82,11 @@ int zfs_metaslab_sm_blksz_with_log = (1 << 17);
/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
* The zfs_metaslab_condense_pct determines how much more compact the in-core
* space map representation must be before we compact it on-disk.
* Values should be greater than or equal to 100.
*/
uint_t zfs_condense_pct = 200;
uint_t zfs_metaslab_condense_pct = 200;
/*
* Condensing a metaslab is not guaranteed to actually reduce the amount of
@@ -3826,8 +3826,8 @@ metaslab_group_preload(metaslab_group_t *mg)
* increase as a result of writing out the free space range tree.
*
* 2. Condense if the on-disk space map representation is at least
* zfs_condense_pct/100 times the size of the optimal representation
* (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
* zfs_metaslab_condense_pct/100 times the size of the optimal representation
* (i.e. zfs_metaslab_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
*
* 3. Do not condense if the on-disk size of the space map does not actually
* decrease.
@@ -3863,7 +3863,8 @@ metaslab_should_condense(metaslab_t *msp)
uint64_t optimal_size = space_map_estimate_optimal_size(sm,
msp->ms_allocatable, SM_NO_VDEVID);
return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
return (object_size >=
(optimal_size * zfs_metaslab_condense_pct / 100) &&
object_size > zfs_metaslab_condense_block_threshold * record_size);
}
@@ -6442,6 +6443,14 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
"When looking in size tree, use largest segment instead of exact fit");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_alloc_threshold, U64, ZMOD_RW,
"Minimum size which forces the dynamic allocator to change its "
"allocation strategy");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_free_pct, UINT, ZMOD_RW,
"The minimum free space, in percent, to continue allocations in a "
"first-fit fashion");
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
@@ -6454,6 +6463,18 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
"Normally only consider this many of the best metaslabs in each vdev");
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, sm_blksz_no_log, INT, ZMOD_RW,
"Block size for space map in pools with log space map disabled. "
"Power of 2 greater than 4096.");
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, sm_blksz_with_log, INT, ZMOD_RW,
"Block size for space map in pools with log space map enabled. "
"Power of 2 greater than 4096.");
ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
param_set_active_allocator, param_get_charp, ZMOD_RW,
"SPA active allocator");
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, condense_pct, UINT, ZMOD_RW,
"Condense on-disk spacemap when it is more than this many percents "
"of in-memory counterpart");
+2 -2
View File
@@ -1605,8 +1605,8 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
mutex_enter(&hdl->sa_lock);
mutex_enter(&zp->z_lock);
mutex_enter(&hdl->sa_lock);
err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid,
sizeof (uint64_t));
@@ -1750,8 +1750,8 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
zp->z_is_sa = B_TRUE;
out:
mutex_exit(&zp->z_lock);
mutex_exit(&hdl->sa_lock);
mutex_exit(&zp->z_lock);
kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END);
if (dxattr_obj)
+22 -7
View File
@@ -8333,12 +8333,20 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
return (spa_vdev_exit(spa, newrootvd, txg, error));
/*
* log, dedup and special vdevs should not be replaced by spares.
* Spares can't replace logs
*/
if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
* For special and dedup vdevs a spare must have matching rotational
* characteristics. A rotating spare replacing a non-rotating vdev
* would silently degrade pool performance, so we reject the mismatch.
*/
if (newvd->vdev_isspare &&
oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE &&
newvd->vdev_nonrot != oldvd->vdev_nonrot)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
/*
* A dRAID spare can only replace a child of its parent dRAID vdev.
@@ -11011,6 +11019,10 @@ spa_sync(spa_t *spa, uint64_t txg)
ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
}
for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
vdev_sync_dispatch(vd, txg);
spa_sync_rewrite_vdev_config(spa, tx);
dmu_tx_commit(tx);
@@ -11035,9 +11047,6 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);
/*
* Update usable space statistics.
*/
while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
!= NULL)
vdev_sync_done(vd, txg);
@@ -11811,6 +11820,12 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
"Allow importing pool with up to this number of missing top-level "
"vdevs (in read-only mode)");
ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds_cachefile, U64, ZMOD_RW,
"Allow importing pools with missing top-level vdevs in cache file");
ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds_scan, U64, ZMOD_RW,
"Allow importing pools with missing top-level vdevs during scan");
ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
ZMOD_RW, "Set the livelist condense zthr to pause");
+165 -19
View File
@@ -460,6 +460,7 @@ vdev_prop_get_objid(vdev_t *vd, uint64_t *objid)
} else if (vd->vdev_leaf_zap != 0) {
*objid = vd->vdev_leaf_zap;
} else {
*objid = 0;
return (EINVAL);
}
@@ -474,8 +475,11 @@ vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
uint64_t objid;
int err;
if (vdev_prop_get_objid(vd, &objid) != 0)
return (EINVAL);
if (vdev_prop_get_objid(vd, &objid) != 0) {
/* No ZAP: property was never set, return the default. */
*value = vdev_prop_default_numeric(prop);
return (ENOENT);
}
err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
sizeof (uint64_t), 1, value);
@@ -963,6 +967,20 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_wholedisk) != 0)
vd->vdev_wholedisk = -1ULL;
/*
* Restore the last-known rotational status for leaf vdevs. vdev_open()
* will overwrite this with the hardware value when the device is
* accessible; the persisted value acts as a fallback for failed or
* missing devices so that spare selection can still match on device
* type even when the original disk is gone.
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t rotational = 0;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL,
&rotational) == 0)
vd->vdev_nonrot = !rotational;
}
vic = &vd->vdev_indirect_config;
ASSERT0(vic->vic_mapping_object);
@@ -1117,6 +1135,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops))
vd->vdev_autosit =
vdev_prop_default_numeric(VDEV_PROP_AUTOSIT);
if (ops == &vdev_root_ops)
vd->vdev_failfast =
vdev_prop_default_numeric(VDEV_PROP_FAILFAST);
else
vd->vdev_failfast = ZPROP_BOOLEAN_INHERIT;
/*
* Add ourselves to the parent's list of children.
@@ -3912,10 +3935,9 @@ vdev_load(vdev_t *vd)
vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
1, &failfast);
if (error == 0) {
vd->vdev_failfast = failfast & 1;
vd->vdev_failfast = failfast;
} else if (error == ENOENT) {
vd->vdev_failfast = vdev_prop_default_numeric(
VDEV_PROP_FAILFAST);
vd->vdev_failfast = ZPROP_BOOLEAN_INHERIT;
} else {
vdev_dbgmsg(vd,
"vdev_load: zap_lookup(top_zap=%llu) "
@@ -4224,17 +4246,39 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
/*
 * Taskq callback: 'arg' is the metaslab_t to finish syncing. Calls
 * metaslab_sync_done() for it, using the txg reported by spa_syncing_txg()
 * for the pool that owns the metaslab's vdev.
 */
static void
metaslab_sync_done_task(void *arg)
{
	metaslab_t *ms = arg;
	uint64_t syncing_txg =
	    spa_syncing_txg(ms->ms_group->mg_vd->vdev_spa);

	metaslab_sync_done(ms, syncing_txg);
}
/*
 * Dispatch a metaslab_sync_done_task() onto the pool's spa_sync_tq for
 * every metaslab on vd's vdev_ms_list slot for TXG_CLEAN(txg).
 *
 * The list is only walked here, not emptied. The return value of
 * taskq_dispatch() is deliberately ignored.
 */
void
vdev_sync_dispatch(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	metaslab_t *ms;

	ASSERT(vdev_is_concrete(vd));

	ms = txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg));
	while (ms != NULL) {
		(void) taskq_dispatch(spa->spa_sync_tq,
		    metaslab_sync_done_task, ms, TQ_SLEEP);
		ms = txg_list_next(&vd->vdev_ms_list, ms, TXG_CLEAN(txg));
	}
}
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
ASSERT(vdev_is_concrete(vd));
while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
!= NULL)
metaslab_sync_done(msp, txg);
taskq_wait(vd->vdev_spa->spa_sync_tq);
while (txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)) != NULL)
;
if (reassess) {
metaslab_sync_reassess(vd->vdev_mg);
@@ -6093,6 +6137,29 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
strval);
}
break;
case VDEV_PROP_ALLOC_BIAS: {
intval = fnvpair_value_uint64(elem);
ASSERT3U(intval, !=, VDEV_BIAS_LOG);
const char *bias_str =
(intval == VDEV_BIAS_SPECIAL) ?
VDEV_ALLOC_BIAS_SPECIAL :
(intval == VDEV_BIAS_DEDUP) ?
VDEV_ALLOC_BIAS_DEDUP : NULL;
if (bias_str == NULL) {
(void) zap_remove(mos, objid,
VDEV_TOP_ZAP_ALLOCATION_BIAS, tx);
} else {
VERIFY0(zap_update(mos, objid,
VDEV_TOP_ZAP_ALLOCATION_BIAS,
1, strlen(bias_str) + 1, bias_str, tx));
spa_activate_allocation_classes(spa, tx);
}
spa_history_log_internal(spa, "vdev set", tx,
"vdev_guid=%llu: alloc_bias=%s",
(u_longlong_t)vdev_guid,
bias_str != NULL ? bias_str : "none");
break;
}
default:
/* normalize the property name */
propname = vdev_prop_to_name(prop);
@@ -6207,11 +6274,14 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
error = spa_vdev_alloc(spa, vdev_guid);
break;
case VDEV_PROP_FAILFAST:
if (nvpair_value_uint64(elem, &intval) != 0) {
if (nvpair_value_uint64(elem, &intval) != 0 ||
intval > ZPROP_BOOLEAN_INHERIT ||
(intval == ZPROP_BOOLEAN_INHERIT &&
vd->vdev_ops == &vdev_root_ops)) {
error = EINVAL;
break;
}
vd->vdev_failfast = intval & 1;
vd->vdev_failfast = intval;
break;
case VDEV_PROP_SIT_OUT:
/* Only expose this for a draid or raidz leaf */
@@ -6319,6 +6389,53 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_scheduler = intval;
break;
case VDEV_PROP_ALLOC_BIAS:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
if (vd != vd->vdev_top || vd->vdev_top_zap == 0) {
error = ENOTSUP;
break;
}
/* Log vdevs are not supported: remove and re-add. */
if (vd->vdev_islog) {
error = ENOTSUP;
break;
}
/* special/dedup needs allocation_classes feature */
if (intval != VDEV_BIAS_NONE &&
((intval != VDEV_BIAS_SPECIAL &&
intval != VDEV_BIAS_DEDUP) ||
!spa_feature_is_enabled(spa,
SPA_FEATURE_ALLOCATION_CLASSES))) {
error = ENOTSUP;
break;
}
/*
* Disallow converting the last normal vdev to
* avoid pool suspension on failed allocations.
*/
if (intval != VDEV_BIAS_NONE &&
vd->vdev_alloc_bias == VDEV_BIAS_NONE) {
vdev_t *rvd = spa->spa_root_vdev;
int normal = 0;
for (uint64_t c = 0;
c < rvd->vdev_children; c++) {
vdev_t *cvd = rvd->vdev_child[c];
if (vdev_is_concrete(cvd) &&
cvd->vdev_alloc_bias ==
VDEV_BIAS_NONE &&
!cvd->vdev_noalloc)
normal++;
}
if (normal <= 1) {
error = ENOTSUP;
break;
}
}
vd->vdev_alloc_bias = (vdev_alloc_bias_t)intval;
break;
default:
/* Most processing is done in vdev_props_set_sync */
break;
@@ -6350,7 +6467,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
int err = 0;
uint64_t objid;
uint64_t objid = 0;
uint64_t vdev_guid;
nvpair_t *elem = NULL;
nvlist_t *nvprops = NULL;
@@ -6369,9 +6486,15 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
if (vdev_prop_get_objid(vd, &objid) != 0)
return (SET_ERROR(EINVAL));
ASSERT(objid != 0);
/*
* A missing ZAP is normal for spare and L2ARC vdevs, which are
* not part of the main vdev tree and never get ZAPs allocated.
* Many properties are sourced directly from vdev_t fields and
* work fine without one; ZAP-backed properties will return their
* default values. objid is set to 0 when absent and the few
* cases that call zap_lookup directly guard against this below.
*/
(void) vdev_prop_get_objid(vd, &objid);
mutex_enter(&spa->spa_props_lock);
@@ -6694,18 +6817,28 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
break;
case VDEV_PROP_FAILFAST:
src = ZPROP_SRC_LOCAL;
strval = NULL;
err = zap_lookup(mos, objid, nvpair_name(elem),
if (objid != 0) {
err = zap_lookup(mos, objid,
nvpair_name(elem),
sizeof (uint64_t), 1, &intval);
} else {
err = ENOENT;
}
if (err == ENOENT) {
intval = vdev_prop_default_numeric(
if (vd->vdev_ops == &vdev_root_ops)
intval =
vdev_prop_default_numeric(
prop);
else
intval = ZPROP_BOOLEAN_INHERIT;
err = 0;
} else if (err) {
break;
}
if (intval == vdev_prop_default_numeric(prop))
if (intval == ZPROP_BOOLEAN_INHERIT ||
(vd->vdev_ops == &vdev_root_ops &&
intval == 1))
src = ZPROP_SRC_DEFAULT;
vdev_prop_add_list(outnvl, propname, strval,
@@ -6746,6 +6879,17 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
vdev_prop_add_list(outnvl, propname, NULL,
boolval, src);
break;
case VDEV_PROP_ALLOC_BIAS:
if (vd == vd->vdev_top) {
vdev_prop_add_list(outnvl, propname,
NULL, vd->vdev_alloc_bias,
ZPROP_SRC_NONE);
}
continue;
case VDEV_PROP_ROTATIONAL:
vdev_prop_add_list(outnvl, propname, NULL,
!vd->vdev_nonrot, ZPROP_SRC_NONE);
continue;
case VDEV_PROP_CHECKSUM_N:
case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N:
@@ -6771,6 +6915,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
/* FALLTHRU */
case VDEV_PROP_USERPROP:
/* User Properties */
if (objid == 0)
continue;
src = ZPROP_SRC_LOCAL;
err = zap_length(mos, objid, nvpair_name(elem),
@@ -467,6 +467,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd->vdev_top != NULL) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
vd->vdev_top->vdev_guid);
}
if (vd->vdev_path != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
@@ -493,6 +498,11 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_wholedisk);
}
if (vd->vdev_ops->vdev_op_leaf) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL,
!vd->vdev_nonrot);
}
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
@@ -502,6 +512,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (flags & VDEV_CONFIG_L2CACHE)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
if ((flags & VDEV_CONFIG_SPARE) && vd->vdev_asize != 0)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd == vd->vdev_top) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
@@ -1392,6 +1405,7 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
VB_NVLIST);
break;
}
vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
}
@@ -102,14 +102,14 @@
#define WVR(X) [w##X] "=w" (w##X)
#define UVR0_(REG, ...) [w##REG] "+&w" (w##REG)
#define UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR0_(REG, ...) [w##REG] "+w" (w##REG)
#define UVR1_(_1, REG, ...) [w##REG] "+w" (w##REG)
#define UVR2_(_1, _2, REG, ...) [w##REG] "+w" (w##REG)
#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+w" (w##REG)
#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+w" (w##REG)
#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+w" (w##REG)
#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+w" (w##REG)
#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+w" (w##REG)
#define UVR0(r...) UVR0_(r)
#define UVR1(r...) UVR1_(r)
@@ -120,7 +120,7 @@
#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)
#define UVR(X) [w##X] "+&w" (w##X)
#define UVR(X) [w##X] "+w" (w##X)
#define R_01(REG1, REG2, ...) REG1, REG2
#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+558
View File
@@ -0,0 +1,558 @@
// SPDX-License-Identifier: CDDL-1.0
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2024, Klara, Inc.
* Copyright (c) 2026, TrueNAS.
*/
#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
static kmem_cache_t *zap_name_cache;
static kmem_cache_t *zap_attr_cache;
static kmem_cache_t *zap_name_long_cache;
static kmem_cache_t *zap_attr_long_cache;
/*
 * Create the four kmem caches used for zap_name_t and zap_attribute_t
 * objects. Each object type gets a regular cache, with room for
 * ZAP_MAXNAMELEN trailing bytes, and a "long" cache, with room for
 * ZAP_MAXNAMELEN_NEW trailing bytes. Torn down by zap_fini().
 * Part of the public interface in zap.h.
 */
void
zap_init(void)
{
	const size_t name_size = sizeof (zap_name_t) + ZAP_MAXNAMELEN;
	const size_t attr_size = sizeof (zap_attribute_t) + ZAP_MAXNAMELEN;
	const size_t name_long_size =
	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW;
	const size_t attr_long_size =
	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW;

	zap_name_cache = kmem_cache_create("zap_name", name_size, 0,
	    NULL, NULL, NULL, NULL, NULL, 0);
	zap_attr_cache = kmem_cache_create("zap_attr_cache", attr_size, 0,
	    NULL, NULL, NULL, NULL, NULL, 0);
	zap_name_long_cache = kmem_cache_create("zap_name_long",
	    name_long_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
	    attr_long_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
zap_fini(void)
{
kmem_cache_destroy(zap_name_cache);
kmem_cache_destroy(zap_attr_cache);
kmem_cache_destroy(zap_name_long_cache);
kmem_cache_destroy(zap_attr_long_cache);
}
/*
 * Normalize the NUL-terminated string `name` into `namenorm`, whose capacity
 * is `outlen` bytes, by calling u8_textprep_str() with `normflags` plus
 * U8_TEXTPREP_IGNORE_NULL and U8_TEXTPREP_IGNORE_INVALID.
 *
 * Must not be used on a ZAP with ZAP_FLAG_UINT64_KEY set (asserted).
 *
 * Returns the error number reported by u8_textprep_str(); it stays 0 when
 * none is reported.
 */
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
    size_t outlen)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	/* The input length handed to u8_textprep_str() counts the NUL. */
	size_t srclen = strlen(name) + 1;
	int u8_err = 0;
	const int prepflags = normflags | U8_TEXTPREP_IGNORE_NULL |
	    U8_TEXTPREP_IGNORE_INVALID;

	(void) u8_textprep_str((char *)name, &srclen, namenorm, &outlen,
	    prepflags, U8_UNICODE_LATEST, &u8_err);

	return (u8_err);
}
/*
 * Allocate a zap_name_t bound to `zap`. `longname` selects the long-name
 * cache, in which case zn_normbuf_len is ZAP_MAXNAMELEN_NEW; otherwise the
 * regular cache is used and zn_normbuf_len is ZAP_MAXNAMELEN. Only zn_zap
 * and zn_normbuf_len are initialised here.
 */
zap_name_t *
zap_name_alloc(zap_t *zap, boolean_t longname)
{
	zap_name_t *zn;

	if (longname) {
		zn = kmem_cache_alloc(zap_name_long_cache, KM_SLEEP);
		zn->zn_zap = zap;
		zn->zn_normbuf_len = ZAP_MAXNAMELEN_NEW;
	} else {
		zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
		zn->zn_zap = zap;
		zn->zn_normbuf_len = ZAP_MAXNAMELEN;
	}

	return (zn);
}
/*
 * Allocate a zap_name_t for the string key `key` and initialise it with
 * zap_name_init_str(). The long-name cache is chosen when the key, counting
 * its terminating NUL, is larger than ZAP_MAXNAMELEN.
 *
 * Returns the new name, or NULL if initialisation fails (the allocation is
 * freed in that case).
 */
zap_name_t *
zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
{
	const size_t nbytes = strlen(key) + 1;
	zap_name_t *zn;

	zn = zap_name_alloc(zap, (nbytes > ZAP_MAXNAMELEN));
	if (zap_name_init_str(zn, key, mt) == 0)
		return (zn);

	zap_name_free(zn);
	return (NULL);
}
/*
 * Allocate a zap_name_t for a key made of `numints` uint64_t words. No
 * normalization applies to such keys (zap_normflags is asserted to be 0), so
 * the original and normalized key fields both refer to `key` itself; the key
 * is referenced, not copied. The object always comes from the regular name
 * cache, and zn_normbuf_len is set to ZAP_MAXNAMELEN so that zap_name_free()
 * hands it back to that cache.
 */
zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);

	ASSERT0(zap->zap_normflags);

	zn->zn_zap = zap;
	zn->zn_normbuf_len = ZAP_MAXNAMELEN;
	zn->zn_matchtype = 0;
	zn->zn_key_intlen = sizeof (*key);

	zn->zn_key_norm = key;
	zn->zn_key_orig = zn->zn_key_norm;
	zn->zn_key_norm_numints = numints;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints;

	/* The hash reads the key fields set above, so compute it last. */
	zn->zn_hash = zap_hash(zn);

	return (zn);
}
/*
 * Return a zap_name_t to the cache it was allocated from. The cache is
 * identified by zn_normbuf_len: ZAP_MAXNAMELEN means the regular cache,
 * anything else must be ZAP_MAXNAMELEN_NEW (asserted) and means the long
 * cache.
 */
void
zap_name_free(zap_name_t *zn)
{
	if (zn->zn_normbuf_len != ZAP_MAXNAMELEN) {
		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
		kmem_cache_free(zap_name_long_cache, zn);
		return;
	}

	kmem_cache_free(zap_name_cache, zn);
}
/*
* Initialise an already-allocated zap_name_t (zn->zn_zap must be set) for the
* NUL-terminated string key `key` with match type `mt`.
*
* The key is referenced through zn_key_orig, not copied. When the ZAP has
* normalization flags, a normalized copy is written into zn_normbuf and
* zn_key_norm points at it; otherwise zn_key_norm aliases the original key.
* zn_hash is filled in from zap_hash().
*
* Returns 0 on success, or ENOTSUP if normalization fails, or if `mt` is
* non-zero on a ZAP that has no normalization flags.
*/
int
zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
zap_t *zap = zn->zn_zap;
/* Key size in bytes, counting the terminating NUL. */
size_t key_len = strlen(key) + 1;
/* Make sure zn is allocated for longname if key is long */
IMPLY(key_len > ZAP_MAXNAMELEN,
zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);
zn->zn_key_intlen = sizeof (*key);
zn->zn_key_orig = key;
zn->zn_key_orig_numints = key_len;
zn->zn_matchtype = mt;
zn->zn_normflags = zap->zap_normflags;
/*
* If we're dealing with a case sensitive lookup on a mixed or
* insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
* will fold case to all caps overriding the lookup request.
*/
if (mt & MT_MATCH_CASE)
zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
if (zap->zap_normflags) {
/*
* We *must* use zap_normflags because this normalization is
* what the hash is computed from.
*/
if (zap_normalize(zap, key, zn->zn_normbuf,
zap->zap_normflags, zn->zn_normbuf_len) != 0)
return (SET_ERROR(ENOTSUP));
zn->zn_key_norm = zn->zn_normbuf;
zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
} else {
/* Without normalization flags only mt == 0 is accepted. */
if (mt != 0)
return (SET_ERROR(ENOTSUP));
zn->zn_key_norm = zn->zn_key_orig;
zn->zn_key_norm_numints = zn->zn_key_orig_numints;
}
/*
* The hash has to be taken here, before zn_normbuf is possibly
* overwritten below with the matching-specific normalization.
*/
zn->zn_hash = zap_hash(zn);
if (zap->zap_normflags != zn->zn_normflags) {
/*
* We *must* use zn_normflags because this normalization is
* what the matching is based on. (Not the hash!)
*/
if (zap_normalize(zap, key, zn->zn_normbuf,
zn->zn_normflags, zn->zn_normbuf_len) != 0)
return (SET_ERROR(ENOTSUP));
zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
}
return (0);
}
/*
 * Decide whether the stored entry name `matchname` matches the key held in
 * `zn`.
 *
 * Without MT_NORMALIZE in zn_matchtype this is a plain strcmp() against the
 * original key. With it, `matchname` is first normalized using zn_normflags
 * and the result is compared against zn_key_norm; if normalization fails
 * the names are reported as not matching.
 *
 * Must not be used on a ZAP with ZAP_FLAG_UINT64_KEY set (asserted).
 */
boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
	boolean_t matched = B_FALSE;

	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));

	if (!(zn->zn_matchtype & MT_NORMALIZE)) {
		matched = (strcmp(zn->zn_key_orig, matchname) == 0);
		return (matched);
	}

	char stackbuf[ZAP_MAXNAMELEN];
	char *cmpbuf = stackbuf;
	size_t buflen = zn->zn_normbuf_len;

	/*
	 * A buffer larger than ZAP_MAXNAMELEN is taken from the heap instead
	 * of the stack, since it would exceed the stack limit of 1024.
	 */
	if (buflen > ZAP_MAXNAMELEN)
		cmpbuf = kmem_alloc(buflen, KM_SLEEP);

	if (zap_normalize(zn->zn_zap, matchname, cmpbuf, zn->zn_normflags,
	    buflen) == 0)
		matched = (strcmp(zn->zn_key_norm, cmpbuf) == 0);

	if (cmpbuf != stackbuf)
		kmem_free(cmpbuf, buflen);

	return (matched);
}
/*
* Compute the hash of the key held in `zn`.
*
* For a ZAP with ZAP_FLAG_PRE_HASHED_KEY the first uint64_t of the original
* key is taken as the hash. Otherwise the hash is seeded with zap_salt and
* folded, one byte at a time, through zfs_crc64_table over the normalized
* key: every byte of every word for uint64 keys, or every byte except the
* terminating NUL for string keys.
*
* In both cases only the top zap_hashbits() bits are kept; the low bits of
* the returned value are zero.
*/
uint64_t
zap_hash(zap_name_t *zn)
{
zap_t *zap = zn->zn_zap;
uint64_t h = 0;
if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
h = *(uint64_t *)zn->zn_key_orig;
} else {
h = zap->zap_salt;
ASSERT(h != 0);
/* NOTE(review): presumably checks the CRC table is initialised. */
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
const uint64_t *wp = zn->zn_key_norm;
ASSERT(zn->zn_key_intlen == 8);
for (int i = 0; i < zn->zn_key_norm_numints;
wp++, i++) {
uint64_t word = *wp;
/* Consume the word low byte first, 8 bytes in all. */
for (int j = 0; j < 8; j++) {
h = (h >> 8) ^
zfs_crc64_table[(h ^ word) & 0xFF];
word >>= NBBY;
}
}
} else {
const uint8_t *cp = zn->zn_key_norm;
/*
* We previously stored the terminating null on
* disk, but didn't hash it, so we need to
* continue to not hash it. (The
* zn_key_*_numints includes the terminating
* null for non-binary keys.)
*/
int len = zn->zn_key_norm_numints - 1;
ASSERT(zn->zn_key_intlen == 1);
for (int i = 0; i < len; cp++, i++) {
h = (h >> 8) ^
zfs_crc64_table[(h ^ *cp) & 0xFF];
}
}
}
/*
* Don't use all 64 bits, since we need some in the cookie for
* the collision differentiator. We MUST use the high bits,
* since those are the ones that we first pay attention to when
* choosing the bucket.
*/
h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
return (h);
}
/*
* Worker for zap_lock_by_dnode(): find or open the zap_t attached to `db`,
* take its rwlock and, when the caller is adding to a full microzap, grow
* the microzap or upgrade it.
*
* dn - dnode of the object; its type must byteswap as a ZAP
* db - held dbuf at offset 0 of the object (asserted)
* tx - transaction; used to dirty the dbuf when the lock ends up held as
* writer, and for the grow/upgrade step
* lti - lock type the caller asked for
* fatreader - if set and the ZAP is not a microzap, RW_READER is taken
* instead of lti
* adding - caller intends to add an entry; together with a non-NULL tx
* this enables the grow/upgrade step
* zapp - out: the locked zap_t on success; NULL on the early error returns
*
* Returns 0 with the lock held, EINVAL if the dnode is not a ZAP, EIO if
* mzap_open() rejects the block, or the error from mzap_upgrade() (in which
* case the lock has been dropped).
*/
static int
zap_lock_impl(dnode_t *dn, dmu_buf_t *db, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
ASSERT0(db->db_offset);
objset_t *os = dmu_buf_get_objset(db);
uint64_t obj = db->db_object;
*zapp = NULL;
if (DMU_OT_BYTESWAP(dn->dn_type) != DMU_BSWAP_ZAP)
return (SET_ERROR(EINVAL));
/* Reuse the zap_t already attached to the dbuf, if there is one. */
zap_t *zap = dmu_buf_get_user(db);
if (zap == NULL) {
zap = mzap_open(db);
if (zap == NULL) {
/*
* mzap_open() didn't like what it saw on-disk.
* Check for corruption!
*/
return (SET_ERROR(EIO));
}
}
/*
* We're checking zap_ismicro without the lock held, in order to
* tell what type of lock we want. Once we have some sort of
* lock, see if it really is the right type. In practice this
* can only be different if it was upgraded from micro to fat,
* and micro wanted WRITER but fat only needs READER.
*/
krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
rw_enter(&zap->zap_rwlock, lt);
if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
/* it was upgraded, now we only need reader */
ASSERT(lt == RW_WRITER);
ASSERT(RW_READER ==
((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
rw_downgrade(&zap->zap_rwlock);
lt = RW_READER;
}
zap->zap_objset = os;
zap->zap_dnode = dn;
/* The dbuf is only marked dirty here when we hold the lock as writer. */
if (lt == RW_WRITER)
dmu_buf_will_dirty(db, tx);
ASSERT3P(zap->zap_dbuf, ==, db);
ASSERT(!zap->zap_ismicro ||
zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
/* A full microzap that the caller wants to add to needs more room. */
if (zap->zap_ismicro && tx && adding &&
zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
/*
* Growing by another SPA_MINBLOCKSIZE would pass the microzap size
* limit, so hand over to mzap_upgrade() instead of growing.
*/
if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
dprintf("upgrading obj %llu: num_entries=%u\n",
(u_longlong_t)obj, zap->zap_m.zap_num_entries);
*zapp = zap;
int err = mzap_upgrade(zapp, tx, 0);
/*
* NOTE(review): on failure the lock is dropped through the
* original `zap` pointer and *zapp is left as mzap_upgrade()
* set it, not reset to NULL - confirm callers ignore *zapp
* on error.
*/
if (err != 0)
rw_exit(&zap->zap_rwlock);
return (err);
}
VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
/*
* NOTE(review): assumes db->db_size already reflects newsz after
* dmu_object_set_blocksize() - confirm.
*/
zap->zap_m.zap_num_chunks =
db->db_size / MZAP_ENT_LEN - 1;
if (newsz > SPA_OLD_MAXBLOCKSIZE) {
dsl_dataset_t *ds = dmu_objset_ds(os);
if (!dsl_dataset_feature_is_active(ds,
SPA_FEATURE_LARGE_MICROZAP)) {
/*
* A microzap just grew beyond the old limit
* for the first time, so we have to ensure the
* feature flag is activated.
* zap_get_micro_max_size() won't let us get
* here if the feature is not enabled, so we
* don't need any other checks beforehand.
*
* Since we're in open context, we can't
* activate the feature directly, so we instead
* flag it on the dataset for next sync.
*/
dsl_dataset_dirty(ds, tx);
mutex_enter(&ds->ds_lock);
ds->ds_feature_activation
[SPA_FEATURE_LARGE_MICROZAP] =
(void *)B_TRUE;
mutex_exit(&ds->ds_lock);
}
}
}
*zapp = zap;
return (0);
}
/*
 * Lock the ZAP backed by the held dnode `dn`.
 *
 * The dbuf at offset 0 of the object is held under `tag` and passed to
 * zap_lock_impl() together with the locking parameters. On success an
 * additional dnode reference is taken under `tag`, and both that reference
 * and the dbuf hold stay in place until zap_unlock(). On failure the dbuf
 * hold is released again and nothing remains held by this function.
 *
 * Returns 0 on success, or the error from dmu_buf_hold_by_dnode() or
 * zap_lock_impl().
 */
int
zap_lock_by_dnode(dnode_t *dn, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dmu_buf_t *db;
	int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);

	if (err != 0)
		return (err);

	err = zap_lock_impl(dn, db, tx, lti, fatreader, adding, zapp);
	if (err != 0) {
		dmu_buf_rele(db, tag);
		return (err);
	}

	VERIFY(dnode_add_ref(dn, tag));
	return (0);
}
/*
 * Lock the ZAP stored in object `obj` of objset `os`.
 *
 * The dnode is held only for the duration of the call to
 * zap_lock_by_dnode(); that hold is dropped again before returning,
 * whatever the outcome.
 *
 * Returns 0 on success, or the error from dnode_hold() or
 * zap_lock_by_dnode().
 */
int
zap_lock(objset_t *os, uint64_t obj, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
    zap_t **zapp)
{
	dnode_t *dn;
	int err = dnode_hold(os, obj, tag, &dn);

	if (err == 0) {
		err = zap_lock_by_dnode(dn, tx, lti, fatreader, adding, tag,
		    zapp);
		dnode_rele(dn, tag);
	}

	return (err);
}
/*
* Undo a successful zap_lock()/zap_lock_by_dnode(): drop the rwlock, then
* release the dnode reference and the dbuf hold that were taken under `tag`.
*/
void
zap_unlock(zap_t *zap, const void *tag)
{
rw_exit(&zap->zap_rwlock);
dnode_rele(zap->zap_dnode, tag);
/*
* NOTE(review): the dbuf hold is presumably released last on purpose; the
* comment in zap_lock_upgrade() says that hold is what keeps the zap_t
* from being deallocated.
*/
dmu_buf_rele(zap->zap_dbuf, tag);
}
/*
 * Try to obtain the ZAP rwlock as writer without ever dropping it.
 *
 * Returns 1 if the lock is held as writer on return (either it already was,
 * or the in-place upgrade succeeded), 0 if the upgrade attempt failed and
 * the lock is unchanged.
 */
int
zap_lock_try_upgrade(zap_t *zap, dmu_tx_t *tx)
{
	if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
		/* Not a writer yet; attempt the in-place upgrade. */
		if (!rw_tryupgrade(&zap->zap_rwlock))
			return (0);

		/*
		 * zap_lock_impl() only dirties the buffer for a writer, so
		 * it has to be done here now that we have become one.
		 */
		dmu_buf_will_dirty(zap->zap_dbuf, tx);
	}

	return (1);
}
/*
 * Make sure the ZAP rwlock is held as writer. An in-place upgrade is tried
 * first; if that fails the lock is dropped and re-taken as writer, and the
 * dbuf is then marked dirty in `tx`.
 */
void
zap_lock_upgrade(zap_t *zap, dmu_tx_t *tx)
{
	if (!zap_lock_try_upgrade(zap, tx)) {
		/*
		 * Dropping the lock is safe: our hold on zap_dbuf is still
		 * in place, which keeps the dbuf from being evicted and the
		 * zap_t from being deallocated.
		 */
		rw_exit(&zap->zap_rwlock);
		rw_enter(&zap->zap_rwlock, RW_WRITER);
		dmu_buf_will_dirty(zap->zap_dbuf, tx);
	}
}
/*
 * Tear down and free a zap_t. `dbu` is the zap_t itself.
 * NOTE(review): presumably installed as the dbuf user eviction callback -
 * confirm at the registration site.
 */
void
zap_evict_sync(void *dbu)
{
	zap_t *zap = dbu;

	rw_destroy(&zap->zap_rwlock);

	/* Release the state specific to the fat or micro representation. */
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	else
		mze_destroy(zap);

	kmem_free(zap, sizeof (*zap));
}
/*
 * Return the flags of a ZAP: always 0 for a microzap, otherwise the
 * zap_flags value read through zap_f_phys().
 */
uint64_t
zap_getflags(zap_t *zap)
{
	return (zap->zap_ismicro ? 0 : zap_f_phys(zap)->zap_flags);
}
/*
 * Number of hash bits used by this ZAP: 48 when ZAP_FLAG_HASH64 is set,
 * 28 otherwise.
 */
int
zap_hashbits(zap_t *zap)
{
	return ((zap_getflags(zap) & ZAP_FLAG_HASH64) ? 48 : 28);
}
/*
 * Upper bound that depends on ZAP_FLAG_HASH64: 2^16 - 1 when the flag is
 * set, the all-ones 32-bit value otherwise.
 * NOTE(review): presumably the largest collision differentiator - confirm.
 */
uint32_t
zap_maxcd(zap_t *zap)
{
	if (!(zap_getflags(zap) & ZAP_FLAG_HASH64))
		return (-1U);

	return ((1 << 16) - 1);
}
/* DMU byteswap callback for DMU_BSWAP_ZAP, see dmu_ot_byteswap. */
void
zap_byteswap(void *buf, size_t size)
{
	/*
	 * The first 64-bit word is the block type. It is tested in both byte
	 * orders, so a microzap block is recognised whether or not the
	 * buffer has been swapped yet.
	 */
	const uint64_t block_type = *(uint64_t *)buf;

	if (block_type != ZBT_MICRO && block_type != BSWAP_64(ZBT_MICRO)) {
		fzap_byteswap(buf, size);
		return;
	}

	mzap_byteswap(buf, size);
}
/*
* Cursor attribute allocator/free. Part of the public interface in zap.h,
* in this file to get access to the kmem caches.
*/
/*
 * Allocate a zap_attribute_t from the long-name cache (`longname` set) or
 * the regular cache, and record the matching name capacity,
 * ZAP_MAXNAMELEN_NEW or ZAP_MAXNAMELEN, in za_name_len. No other field is
 * initialised.
 */
static zap_attribute_t *
zap_attribute_alloc_impl(boolean_t longname)
{
	zap_attribute_t *attr;

	if (longname) {
		attr = kmem_cache_alloc(zap_attr_long_cache, KM_SLEEP);
		attr->za_name_len = ZAP_MAXNAMELEN_NEW;
	} else {
		attr = kmem_cache_alloc(zap_attr_cache, KM_SLEEP);
		attr->za_name_len = ZAP_MAXNAMELEN;
	}

	return (attr);
}
/*
 * Allocate a zap_attribute_t whose name capacity is ZAP_MAXNAMELEN.
 * Release it with zap_attribute_free().
 */
zap_attribute_t *
zap_attribute_alloc(void)
{
	zap_attribute_t *attr = zap_attribute_alloc_impl(B_FALSE);

	return (attr);
}
/*
 * Allocate a zap_attribute_t whose name capacity is ZAP_MAXNAMELEN_NEW.
 * Release it with zap_attribute_free().
 */
zap_attribute_t *
zap_attribute_long_alloc(void)
{
	zap_attribute_t *attr = zap_attribute_alloc_impl(B_TRUE);

	return (attr);
}
/*
 * Return a zap_attribute_t to the cache it was allocated from. The cache is
 * identified by za_name_len: ZAP_MAXNAMELEN means the regular cache,
 * anything else must be ZAP_MAXNAMELEN_NEW (asserted) and means the long
 * cache.
 */
void
zap_attribute_free(zap_attribute_t *za)
{
	if (za->za_name_len != ZAP_MAXNAMELEN) {
		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
		kmem_cache_free(zap_attr_long_cache, za);
		return;
	}

	kmem_cache_free(zap_attr_cache, za);
}

Some files were not shown because too many files have changed in this diff Show More