Notable upstream pull request merges:
 #16307 1d43387dd zdb: Add -O option for -r to specify object-id
 #17965 a62c62120 ARC: Pre-convert zfs_arc_min_prefetch_ms
 #17970 d393166c5 ARC: Increase parallel eviction batching
 #17981 20f09eae4 ZIO: ZIO_STAGE_DDT_WRITE is a blocking stage
 #17983 ff47dd35e Fix ddtprune causing space leak
 #18015 86b064469 FreeBSD: Fix a potential null dereference
                  in zfs_freebsd_fsync() (already merged)
 #18020 ff47dd35e Ensure 64-bit `off_t` is used in user space
                  instead of `loff_t`
 #18028 09492e0f2 Reduce dataset buffers re-dirtying
 #18033 f72fd378c Defer async destroys on pool import
 #18043 3d76ba273 Improve async destroy processing timing
 #18044 46d6f1fe5 DDT: Move logs searches out of the lock
 #18047 ff5414406 DDT: Switch to using ZAP _by_dnode() interfaces
 #18048 3b1ff816b DDT: Add/use zap_lookup_length_uint64_by_dnode()
 #18055 22e89aca8 DDT: Fix compressed entry buffer size
 #18059 0550abd4b RAIDZ: Remove some excessive logging
 #18060 a83bb15fc Reduce minimal scrub/resilver times
 #18061 962e68865 Use reduced precision for scan times
 #18063 051a8c749 Bypass snprintf() in quota checks if no quotas set
 #18064 7ff329ac2 Fix rangelock test for growing block size

Obtained from:	OpenZFS
OpenZFS commit:	962e68865e
This commit is contained in:
Martin Matuska
2025-12-19 21:44:42 +01:00
61 changed files with 1344 additions and 721 deletions
@@ -13,6 +13,20 @@ set -eu
# handle on what the timeout value should be.
# Background watchdog: every 30 seconds print the command line of the process
# currently using the most CPU (first row after the ps header when sorted by
# descending %CPU), so a stalled step can be identified from the job log.
(while true ; do sleep 30 && echo "[watchdog: $(ps -eo cmd --sort=-pcpu | head -n 2 | tail -n 1)]"; done) &
# The default 'azure.archive.ubuntu.com' mirrors can be really slow.
# Prioritize the official Ubuntu mirrors.
#
# The normal apt-mirrors.txt will look like:
#
# http://azure.archive.ubuntu.com/ubuntu/ priority:1
# https://archive.ubuntu.com/ubuntu/ priority:2
# https://security.ubuntu.com/ubuntu/ priority:3
#
# Just delete the 'azure.archive.ubuntu.com' line.
sudo sed -i '/azure.archive.ubuntu.com/d' /etc/apt/apt-mirrors.txt
echo "Using mirrors:"
cat /etc/apt/apt-mirrors.txt
# install needed packages
export DEBIAN_FRONTEND="noninteractive"
sudo apt-get -y update
@@ -95,13 +95,6 @@ case "$OS" in
KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
NIC="rtl8139"
;;
freebsd14-2r)
FreeBSD="14.2-RELEASE"
OSNAME="FreeBSD $FreeBSD"
OSv="freebsd14.0"
URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
;;
freebsd14-3r)
FreeBSD="14.3-RELEASE"
OSNAME="FreeBSD $FreeBSD"
@@ -60,20 +60,16 @@ jobs:
ref: ${{ github.event.pull_request.head.sha }}
- name: Setup QEMU
timeout-minutes: 10
run: .github/workflows/scripts/qemu-1-setup.sh
- name: Start build machine
timeout-minutes: 10
run: .github/workflows/scripts/qemu-2-start.sh ${{ matrix.os }}
- name: Install dependencies
timeout-minutes: 20
run: |
.github/workflows/scripts/qemu-3-deps.sh ${{ matrix.os }}
- name: Build modules or Test repo
timeout-minutes: 60
run: |
set -e
if [ "${{ github.event.inputs.test_type }}" == "Test repo" ] ; then
@@ -94,7 +90,6 @@ jobs:
- name: Prepare artifacts
if: always()
timeout-minutes: 10
run: |
rsync -a zfs@vm0:/tmp/repo /tmp || true
.github/workflows/scripts/replace-dupes-with-symlinks.sh /tmp/repo
+4 -4
View File
@@ -46,7 +46,7 @@ jobs:
os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora41", "fedora42", "fedora43", "ubuntu22", "ubuntu24"]'
;;
freebsd)
os_selection='["freebsd13-5r", "freebsd14-2r", "freebsd14-3r", "freebsd13-5s", "freebsd14-3s", "freebsd15-0s", "freebsd16-0c"]'
os_selection='["freebsd13-5r", "freebsd14-3r", "freebsd13-5s", "freebsd14-3s", "freebsd15-0s", "freebsd16-0c"]'
;;
*)
# default list
@@ -76,7 +76,7 @@ jobs:
# debian: debian12, debian13, ubuntu22, ubuntu24
# misc: archlinux, tumbleweed
# FreeBSD variants of November 2025:
# FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r
# FreeBSD Release: freebsd13-5r, freebsd14-3r, freebsd15-0r
# FreeBSD Stable: freebsd13-5s, freebsd14-3s, freebsd15-0s
# FreeBSD Current: freebsd16-0c
os: ${{ fromJson(needs.test-config.outputs.test_os) }}
@@ -87,7 +87,7 @@ jobs:
ref: ${{ github.event.pull_request.head.sha }}
- name: Setup QEMU
timeout-minutes: 20
timeout-minutes: 60
run: |
# Add a timestamp to each line to debug timeouts
while IFS=$'\n' read -r line; do
@@ -99,7 +99,7 @@ jobs:
run: .github/workflows/scripts/qemu-2-start.sh ${{ matrix.os }}
- name: Install dependencies
timeout-minutes: 20
timeout-minutes: 60
run: .github/workflows/scripts/qemu-3-deps.sh ${{ matrix.os }} ${{ github.event.inputs.fedora_kernel_ver }}
- name: Build modules
+1 -1
View File
@@ -6,5 +6,5 @@ Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
Linux-Maximum: 6.17
Linux-Maximum: 6.18
Linux-Minimum: 4.18
+8 -3
View File
@@ -739,13 +739,14 @@ usage(void)
"[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
"\t%s -O [-K <key>] <dataset> <path>\n"
"\t%s -r [-K <key>] <dataset> <path> <destination>\n"
"\t%s -r [-K <key>] -O <dataset> <object-id> <destination>\n"
"\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
"\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
"\t%s -E [-A] word0:word1:...:word15\n"
"\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
"<poolname>\n\n",
cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
cmdname, cmdname, cmdname, cmdname, cmdname);
cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
(void) fprintf(stderr, " Dataset name must include at least one "
"separator character '/' or '@'\n");
@@ -9956,7 +9957,7 @@ main(int argc, char **argv)
* which imports the pool to the namespace if it's
* not in the cachefile.
*/
if (dump_opt['O']) {
if (dump_opt['O'] && !dump_opt['r']) {
if (argc != 2)
usage();
dump_opt['v'] = verbose + 3;
@@ -9969,7 +9970,11 @@ main(int argc, char **argv)
if (argc != 3)
usage();
dump_opt['v'] = verbose;
error = dump_path(argv[0], argv[1], &object);
if (dump_opt['O']) {
object = strtoull(argv[1], NULL, 0);
} else {
error = dump_path(argv[0], argv[1], &object);
}
if (error != 0)
fatal("internal error: %s", strerror(error));
}
@@ -0,0 +1,23 @@
dnl #
dnl # 6.18: some architectures and config options cause the kasan_ inline
dnl # functions to reference the GPL-only symbol 'kasan_flag_enabled',
dnl # breaking the build. Detect this and work around it.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_KASAN_ENABLED], [
ZFS_LINUX_TEST_SRC([kasan_enabled], [
#include <linux/kasan.h>
], [
kasan_enabled();
], [], [ZFS_META_LICENSE])
])
dnl #
dnl # Report whether kasan_enabled() is GPL-only, based on the outcome of the
dnl # kasan_enabled_license test. Note the branches are inverted relative to
dnl # the question being asked: the first branch answers "no", the second
dnl # answers "yes" and defines HAVE_KASAN_ENABLED_GPL_ONLY.
dnl # NOTE(review): presumably the first branch is the one taken when the
dnl # test module builds successfully under the ZFS_META_LICENSE license --
dnl # confirm against the definition of ZFS_LINUX_TEST_RESULT.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_KASAN_ENABLED], [
AC_MSG_CHECKING([whether kasan_enabled() is GPL-only])
ZFS_LINUX_TEST_RESULT([kasan_enabled_license], [
AC_MSG_RESULT(no)
], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_KASAN_ENABLED_GPL_ONLY, 1,
[kasan_enabled() is GPL-only])
])
])
+2
View File
@@ -138,6 +138,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE
ZFS_AC_KERNEL_SRC_NAMESPACE
ZFS_AC_KERNEL_SRC_INODE_GENERIC_DROP
ZFS_AC_KERNEL_SRC_KASAN_ENABLED
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -260,6 +261,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_SOPS_FREE_INODE
ZFS_AC_KERNEL_NAMESPACE
ZFS_AC_KERNEL_INODE_GENERIC_DROP
ZFS_AC_KERNEL_KASAN_ENABLED
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE
@@ -0,0 +1,21 @@
dnl #
dnl # ZFS_AC_CONFIG_USER_LARGEFILE
dnl #
dnl # Ensure off_t is 64-bit for large file support in userspace.
dnl # This is required for OpenZFS to handle files larger than 2GB.
dnl #
dnl # AC_SYS_LARGEFILE is invoked first, then the size of off_t is measured
dnl # with AC_CHECK_SIZEOF. Configure aborts through AC_MSG_FAILURE unless
dnl # the measured size is exactly 8 bytes; otherwise the size is reported
dnl # and configuration continues.
dnl #
AC_DEFUN([ZFS_AC_CONFIG_USER_LARGEFILE], [
AC_SYS_LARGEFILE
AC_CHECK_SIZEOF([off_t])
AC_MSG_CHECKING([for 64-bit off_t])
AS_IF([test "$ac_cv_sizeof_off_t" -ne 8], [
AC_MSG_RESULT([no, $ac_cv_sizeof_off_t bytes])
AC_MSG_FAILURE([
*** OpenZFS userspace requires 64-bit off_t support for large files.
*** Please ensure your system supports large file operations.
*** Current off_t size: $ac_cv_sizeof_off_t bytes])
], [
AC_MSG_RESULT([yes, $ac_cv_sizeof_off_t bytes])
])
])
+1
View File
@@ -3,6 +3,7 @@ dnl # Default ZFS user configuration
dnl #
AC_DEFUN([ZFS_AC_CONFIG_USER], [
ZFS_AC_CONFIG_USER_GETTEXT
ZFS_AC_CONFIG_USER_LARGEFILE
ZFS_AC_CONFIG_USER_MOUNT_HELPER
ZFS_AC_CONFIG_USER_SYSVINIT
ZFS_AC_CONFIG_USER_DRACUT
File diff suppressed because it is too large Load Diff
+4
View File
@@ -60,6 +60,10 @@ ZPOOL_IMPORT_ALL_VISIBLE='no'
# This is a space separated list.
#ZFS_POOL_EXCEPTIONS="test2"
# Additional important (operating system) file systems to mount beside
# the root file system.
#ZFS_INITRD_ADDITIONAL_DATASETS="rpool/usr rpool/var rpool/var/spool"
# Should the datasets be mounted verbosely?
# A mount counter will be used when mounting if set to 'yes'.
VERBOSE_MOUNT='no'
+12 -12
View File
@@ -26,13 +26,13 @@ fi
# Of course the functions we need are called differently
# on different distributions - it would be way too easy
# otherwise!!
if type log_failure_msg > /dev/null 2>&1 ; then
if command -v log_failure_msg > /dev/null 2>&1 ; then
# LSB functions - fall through
zfs_log_begin_msg() { log_begin_msg "$1"; }
zfs_log_end_msg() { log_end_msg "$1"; }
zfs_log_failure_msg() { log_failure_msg "$1"; }
zfs_log_progress_msg() { log_progress_msg "$1"; }
elif type success > /dev/null 2>&1 ; then
elif command -v success > /dev/null 2>&1 ; then
# Fedora/RedHat functions
zfs_set_ifs() {
# For some reason, the init function library has a problem
@@ -64,7 +64,7 @@ elif type success > /dev/null 2>&1 ; then
zfs_set_ifs "$TMP_IFS"
}
zfs_log_progress_msg() { printf "%s" "$""$1"; }
elif type einfo > /dev/null 2>&1 ; then
elif command -v einfo > /dev/null 2>&1 ; then
# Gentoo functions
zfs_log_begin_msg() { ebegin "$1"; }
zfs_log_end_msg() { eend "$1"; }
@@ -109,7 +109,7 @@ fi
# ----------------------------------------------------
export ZFS ZED ZPOOL ZPOOL_CACHE ZFS_LOAD_KEY ZFS_UNLOAD_KEY ZFS_MOUNT ZFS_UNMOUNT \
ZFS_SHARE ZFS_UNSHARE
ZFS_SHARE ZFS_UNSHARE ZFS_POOL_EXCEPTIONS ZFS_INITRD_ADDITIONAL_DATASETS
zfs_action()
{
@@ -140,7 +140,7 @@ zfs_daemon_start()
local PIDFILE="$1"; shift
local DAEMON_BIN="$1"; shift
if type start-stop-daemon > /dev/null 2>&1 ; then
if command -v start-stop-daemon > /dev/null 2>&1 ; then
# LSB functions
start-stop-daemon --start --quiet --pidfile "$PIDFILE" \
--exec "$DAEMON_BIN" --test > /dev/null || return 1
@@ -157,7 +157,7 @@ zfs_daemon_start()
then
ln -sf "$PIDFILE" /run/sendsigs.omit.d/zed
fi
elif type daemon > /dev/null 2>&1 ; then
elif command -v daemon > /dev/null 2>&1 ; then
# Fedora/RedHat functions
# shellcheck disable=SC2086
daemon --pidfile "$PIDFILE" "$DAEMON_BIN" "$@"
@@ -182,7 +182,7 @@ zfs_daemon_stop()
local DAEMON_BIN="$2"
local DAEMON_NAME="$3"
if type start-stop-daemon > /dev/null 2>&1 ; then
if command -v start-stop-daemon > /dev/null 2>&1 ; then
# LSB functions
start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 \
--pidfile "$PIDFILE" --name "$DAEMON_NAME"
@@ -190,7 +190,7 @@ zfs_daemon_stop()
[ "$ret" = 0 ] && rm -f "$PIDFILE"
return "$ret"
elif type killproc > /dev/null 2>&1 ; then
elif command -v killproc > /dev/null 2>&1 ; then
# Fedora/RedHat functions
killproc -p "$PIDFILE" "$DAEMON_NAME"
ret="$?"
@@ -212,11 +212,11 @@ zfs_daemon_status()
local DAEMON_BIN="$2"
local DAEMON_NAME="$3"
if type status_of_proc > /dev/null 2>&1 ; then
if command -v status_of_proc > /dev/null 2>&1 ; then
# LSB functions
status_of_proc "$DAEMON_NAME" "$DAEMON_BIN"
return $?
elif type status > /dev/null 2>&1 ; then
elif command -v status > /dev/null 2>&1 ; then
# Fedora/RedHat functions
status -p "$PIDFILE" "$DAEMON_NAME"
return $?
@@ -233,12 +233,12 @@ zfs_daemon_reload()
local PIDFILE="$1"
local DAEMON_NAME="$2"
if type start-stop-daemon > /dev/null 2>&1 ; then
if command -v start-stop-daemon > /dev/null 2>&1 ; then
# LSB functions
start-stop-daemon --stop --signal 1 --quiet \
--pidfile "$PIDFILE" --name "$DAEMON_NAME"
return $?
elif type killproc > /dev/null 2>&1 ; then
elif command -v killproc > /dev/null 2>&1 ; then
# Fedora/RedHat functions
killproc -p "$PIDFILE" "$DAEMON_NAME" -HUP
return $?
+7 -4
View File
@@ -213,6 +213,7 @@ typedef enum {
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */
#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */
#define DDE_FLAG_FROM_FLUSHING (1 << 3) /* loaded from flushing log */
/*
* Additional data to support entry update or repair. This is fixed size
@@ -280,13 +281,14 @@ typedef struct {
*/
typedef struct {
kmutex_t ddt_lock; /* protects changes to all fields */
avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
avl_tree_t ddt_log_tree; /* logged entries */
avl_tree_t ddt_repair_tree; /* entries being repaired */
ddt_log_t ddt_log[2]; /* active/flushing logs */
/*
* Log trees are stable during I/O, and only modified during sync
* with exclusive access.
*/
ddt_log_t ddt_log[2] ____cacheline_aligned; /* logged entries */
ddt_log_t *ddt_log_active; /* pointers into ddt_log */
ddt_log_t *ddt_log_flushing; /* swapped when flush starts */
@@ -324,6 +326,7 @@ typedef struct {
/* per-type/per-class entry store objects */
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
dnode_t *ddt_object_dnode[DDT_TYPES][DDT_CLASSES];
/* object ids for stored, logged and per-type/per-class stats */
uint64_t ddt_stat_object;
+14 -17
View File
@@ -69,8 +69,8 @@ extern "C" {
* the live tree.
*/
typedef struct {
ddt_key_t ddle_key; /* ddt_log_tree key */
avl_node_t ddle_node; /* ddt_log_tree node */
ddt_key_t ddle_key; /* ddl_tree key */
avl_node_t ddle_node; /* ddl_tree node */
ddt_type_t ddle_type; /* storage type */
ddt_class_t ddle_class; /* storage class */
@@ -163,21 +163,18 @@ typedef struct {
int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_contains)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const void *phys, size_t psize,
int (*ddt_op_lookup)(dnode_t *dn, const ddt_key_t *ddk,
void *phys, size_t psize);
int (*ddt_op_contains)(dnode_t *dn, const ddt_key_t *ddk);
void (*ddt_op_prefetch)(dnode_t *dn, const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(dnode_t *dn);
int (*ddt_op_update)(dnode_t *dn, const ddt_key_t *ddk,
const void *phys, size_t psize, dmu_tx_t *tx);
int (*ddt_op_remove)(dnode_t *dn, const ddt_key_t *ddk,
dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
int (*ddt_op_walk)(dnode_t *dn, uint64_t *walk, ddt_key_t *ddk,
void *phys, size_t psize);
int (*ddt_op_count)(dnode_t *dn, uint64_t *count);
} ddt_ops_t;
extern const ddt_ops_t ddt_zap_ops;
@@ -193,7 +190,7 @@ extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe);
ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing);
extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
const ddt_key_t *ddk);
+1 -1
View File
@@ -157,7 +157,7 @@ typedef struct dsl_scan {
/* per txg statistics */
uint64_t scn_visited_this_txg; /* total bps visited this txg */
uint64_t scn_dedup_frees_this_txg; /* dedup bps freed this txg */
uint64_t scn_async_frees_this_txg; /* async frees (dedup/clone/gang) */
uint64_t scn_holes_this_txg;
uint64_t scn_lt_min_this_txg;
uint64_t scn_gt_max_this_txg;
+6
View File
@@ -226,6 +226,9 @@ int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
uint64_t *actual_num_integers);
int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
int zap_prefetch_object(objset_t *os, uint64_t zapobj);
@@ -288,6 +291,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers);
int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t *integer_size, uint64_t *num_integers);
int zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, uint64_t *integer_size, uint64_t *num_integers);
/*
* Remove the specified attribute.
@@ -309,6 +314,7 @@ int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
* object.
*/
int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
int zap_count_by_dnode(dnode_t *dn, uint64_t *count);
/*
* Returns (in name) the name of the entry whose (value & mask)
+2 -1
View File
@@ -219,7 +219,8 @@ void fzap_byteswap(void *buf, size_t size);
int fzap_count(zap_t *zap, uint64_t *count);
int fzap_lookup(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *normalization_conflictp);
char *realname, int rn_len, boolean_t *normalization_conflictp,
uint64_t *actual_num_integers);
void fzap_prefetch(zap_name_t *zn);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
const void *val, const void *tag, dmu_tx_t *tx);
@@ -25,6 +25,17 @@
#include <sys/zfs_context.h>
/*
* loff_t is a Linux kernel/VFS type. glibc and musl expose it to user
* space via <fcntl.h>, but FreeBSD libc does not. For FreeBSD user
* space we map loff_t to off_t so the shared interfaces that use the
* loff_t name still compile. The FreeBSD kernel gets loff_t from its
* own linux-compat headers.
*/
#if !defined(_KERNEL) && defined(__FreeBSD__)
typedef off_t loff_t;
#endif
#ifndef _KERNEL
typedef struct zfs_file {
int f_fd;
+2 -1
View File
@@ -278,7 +278,8 @@ enum zio_stage {
ZIO_VDEV_IO_STAGES)
#define ZIO_BLOCKING_STAGES \
(ZIO_STAGE_DVA_ALLOCATE | \
(ZIO_STAGE_DDT_WRITE | \
ZIO_STAGE_DVA_ALLOCATE | \
ZIO_STAGE_DVA_CLAIM | \
ZIO_STAGE_VDEV_IO_START)
@@ -86,6 +86,6 @@ extern void kstat_delete(kstat_t *);
extern void kstat_set_raw_ops(kstat_t *ksp,
int (*headers)(char *buf, size_t size),
int (*data)(char *buf, size_t size, void *data),
void *(*addr)(kstat_t *ksp, loff_t index));
void *(*addr)(kstat_t *ksp, off_t index));
#endif /* _SYS_KSTAT_H */
@@ -50,19 +50,4 @@ typedef int projid_t;
#include <sys/param.h> /* for NBBY */
#ifdef __FreeBSD__
typedef off_t loff_t;
#endif
/*
* On musl, loff_t is a macro within fcntl.h when _GNU_SOURCE is defined.
* If no macro is defined, a typedef fallback is provided.
*/
#if defined(__linux__) && !defined(__GLIBC__)
#include <fcntl.h>
#ifndef loff_t
typedef off_t loff_t;
#endif
#endif
#endif
+1 -1
View File
@@ -58,7 +58,7 @@ void
kstat_set_raw_ops(kstat_t *ksp,
int (*headers)(char *buf, size_t size),
int (*data)(char *buf, size_t size, void *data),
void *(*addr)(kstat_t *ksp, loff_t index))
void *(*addr)(kstat_t *ksp, off_t index))
{
(void) ksp, (void) headers, (void) data, (void) addr;
}
+26 -5
View File
@@ -771,6 +771,12 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
This batch-style operation prevents entire sub-lists from being evicted at once
but comes at a cost of additional unlocking and locking.
.
.It Sy zfs_arc_evict_batches_limit Ns = Ns Sy 5 Pq uint
Number of
.Sy zfs_arc_evict_batch_limit
batches to process per parallel eviction task under heavy load to reduce the
number of context switches.
.
.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int
Sets the number of ARC eviction threads to be used.
.Pp
@@ -1462,8 +1468,13 @@ Enable/disable the processing of the free_bpobj object.
.It Sy zfs_async_block_max_blocks Ns = Ns Sy UINT64_MAX Po unlimited Pc Pq u64
Maximum number of blocks freed in a single TXG.
.
.It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq u64
Maximum number of dedup blocks freed in a single TXG.
.It Sy zfs_max_async_dedup_frees Ns = Ns Sy 250000 Pq u64
Maximum number of dedup, clone or gang blocks freed in a single TXG.
These frees may require additional I/O, making them more expensive.
.
.It Sy zfs_async_free_zio_wait_interval Ns = Ns Sy 2000 Pq u64
After freeing this many dedup, clone or gang blocks, wait for all pending
I/Os to complete before continuing.
.
.It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq uint
Maximum asynchronous read I/O operations active to each device.
@@ -1733,7 +1744,7 @@ but we chose the more conservative approach of not setting it,
so that there is no possibility of
leaking space in the "partial temporary" failure case.
.
.It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq uint
.It Sy zfs_free_min_time_ms Ns = Ns Sy 500 Ns ms Pq uint
During a
.Nm zfs Cm destroy
operation using the
@@ -1761,6 +1772,16 @@ Blocks that go to the special vdevs are still written indirectly, as with
.Sy logbias Ns = Ns Sy throughput .
This parameter is ignored if an SLOG is present.
.
.It Sy zfs_import_defer_txgs Ns = Ns Sy 5 Pq uint
Number of transaction groups to wait after pool import before starting
background work such as asynchronous block freeing
.Pq from snapshots, clones, and deduplication
and scrub or resilver operations.
This allows the pool import and filesystem mounting to complete more quickly
without interference from background activities.
The default value of 5 transaction groups typically provides sufficient time
for import and mount operations to complete on most systems.
.
.It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64
Pattern written to vdev free space by
.Xr zpool-initialize 8 .
@@ -2095,7 +2116,7 @@ even if the
.Sy resilver_defer
feature is enabled.
.
.It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3 s Pc Pq uint
.It Sy zfs_resilver_min_time_ms Ns = Ns Sy 1500 Ns ms Pq uint
Resilvers are processed by the sync thread.
While resilvering, it will spend at least this much time
working on a resilver between TXG flushes.
@@ -2112,7 +2133,7 @@ in order to verify the checksums of all blocks which have been
copied during the expansion.
This is enabled by default and strongly recommended.
.
.It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq uint
.It Sy zfs_scrub_min_time_ms Ns = Ns Sy 750 Ns ms Pq uint
Scrubs are processed by the sync thread.
While scrubbing, it will spend at least this much time
working on a scrub between TXG flushes.
+11
View File
@@ -84,6 +84,11 @@
.Op Fl K Ar key
.Ar dataset path destination
.Nm
.Fl r
.Fl O
.Op Fl K Ar key
.Ar dataset object-id destination
.Nm
.Fl R
.Op Fl A
.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
@@ -376,6 +381,12 @@ Specified
.Ar path
must be relative to the root of
.Ar dataset .
When used with
.Fl O ,
the
.Ar path
argument is interpreted as an object identifier,
not a path.
This option can be combined with
.Fl v
for increasing verbosity.
+1
View File
@@ -433,6 +433,7 @@ ZFS_OBJS := \
ZFS_OBJS_OS := \
abd_os.o \
arc_os.o \
kasan_compat.o \
mmp_os.o \
policy.o \
qat.o \
@@ -121,11 +121,12 @@ zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
}
/*
* If we need to grow the block size then lock the whole file range.
* If we might grow the block size then lock the whole file range.
* NB: this test should match the check in zfs_grow_blocksize
*/
uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
if (zp->z_size <= zp->z_blksz && end_size > zp->z_blksz &&
(!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
new->lr_offset = 0;
new->lr_length = UINT64_MAX;
}
@@ -888,6 +888,14 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount)
}
}
#ifndef nth_page
/*
* Since 6.18 nth_page() no longer exists, and is no longer required to iterate
* within a single SG entry, so we replace it with a simple addition.
*/
#define nth_page(p, n) ((p)+(n))
#endif
/*
* Map the current chunk into aiter. This can be safely called when the aiter
* has already exhausted, in which case this does nothing.
@@ -915,7 +923,14 @@ abd_iter_map(struct abd_iter *aiter)
aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
paddr = zfs_kmap_local(sg_page(aiter->iter_sg));
struct page *page = sg_page(aiter->iter_sg);
if (PageHighMem(page)) {
page = nth_page(page, offset / PAGE_SIZE);
offset &= PAGE_SIZE - 1;
aiter->iter_mapsize = MIN(aiter->iter_mapsize,
PAGE_SIZE - offset);
}
paddr = zfs_kmap_local(page);
}
aiter->iter_mapaddr = (char *)paddr + offset;
@@ -933,8 +948,14 @@ abd_iter_unmap(struct abd_iter *aiter)
return;
if (!abd_is_linear(aiter->iter_abd)) {
size_t offset = aiter->iter_offset;
struct page *page = sg_page(aiter->iter_sg);
if (PageHighMem(page))
offset &= PAGE_SIZE - 1;
/* LINTED E_FUNC_SET_NOT_USED */
zfs_kunmap_local(aiter->iter_mapaddr - aiter->iter_offset);
zfs_kunmap_local(aiter->iter_mapaddr - offset);
}
ASSERT3P(aiter->iter_mapaddr, !=, NULL);
@@ -1110,14 +1131,6 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE)
#endif
#ifndef nth_page
/*
* Since 6.18 nth_page() no longer exists, and is no longer required to iterate
* within a single SG entry, so we replace it with a simple addition.
*/
#define nth_page(p, n) ((p)+(n))
#endif
void
abd_iter_page(struct abd_iter *aiter)
{
@@ -0,0 +1,48 @@
// SPDX-License-Identifier: CDDL-1.0
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*/
#ifndef _ZFS_LINUX_KASAN_ENABLED_H
#define _ZFS_LINUX_KASAN_ENABLED_H
#ifdef HAVE_KASAN_ENABLED_GPL_ONLY
/*
* The kernel supports a runtime setting to enable/disable KASAN. The control
* flag kasan_flag_enabled is a GPL-only symbol, which prevents us from
* accessing it. Unfortunately, this is called by the header function
* kasan_enabled(), which in turn is used to call or skip instrumentation
* functions in various header-based kernel facilities. If we inadvertently
* call one, the build breaks.
*
* To work around this, we define our own `kasan_flag_enabled` set to "false",
* disabling use of KASAN inside our code. The linker will resolve this symbol
* at build time, and so never needs to reach out to the off-limits kernel
* symbol.
*/
#include <linux/static_key.h>
struct static_key_false kasan_flag_enabled = STATIC_KEY_FALSE_INIT;
#endif
#endif
@@ -117,13 +117,17 @@ static int zfs_snapshot_no_setuid = 0;
typedef struct {
char *se_name; /* full snapshot name */
char *se_path; /* full mount path */
spa_t *se_spa; /* pool spa */
spa_t *se_spa; /* pool spa (NULL if pending) */
uint64_t se_objsetid; /* snapshot objset id */
struct dentry *se_root_dentry; /* snapshot root dentry */
taskqid_t se_taskqid; /* scheduled unmount taskqid */
avl_node_t se_node_name; /* zfs_snapshots_by_name link */
avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
zfs_refcount_t se_refcount; /* reference count */
kmutex_t se_mtx; /* protects se_mounting and se_cv */
kcondvar_t se_cv; /* signal mount completion */
boolean_t se_mounting; /* mount operation in progress */
int se_mount_error; /* error from failed mount */
} zfs_snapentry_t;
static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
@@ -146,6 +150,10 @@ zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
se->se_objsetid = objsetid;
se->se_root_dentry = root_dentry;
se->se_taskqid = TASKQID_INVALID;
mutex_init(&se->se_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&se->se_cv, NULL, CV_DEFAULT, NULL);
se->se_mounting = B_FALSE;
se->se_mount_error = 0;
zfs_refcount_create(&se->se_refcount);
@@ -162,6 +170,8 @@ zfsctl_snapshot_free(zfs_snapentry_t *se)
zfs_refcount_destroy(&se->se_refcount);
kmem_strfree(se->se_name);
kmem_strfree(se->se_path);
mutex_destroy(&se->se_mtx);
cv_destroy(&se->se_cv);
kmem_free(se, sizeof (zfs_snapentry_t));
}
@@ -187,9 +197,9 @@ zfsctl_snapshot_rele(zfs_snapentry_t *se)
}
/*
* Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
* zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
* of the trees a reference is held.
* Add a zfs_snapentry_t to the zfs_snapshots_by_name tree. If the entry
* is not pending (se_spa != NULL), also add to zfs_snapshots_by_objsetid.
* While the zfs_snapentry_t is part of the trees a reference is held.
*/
static void
zfsctl_snapshot_add(zfs_snapentry_t *se)
@@ -197,24 +207,42 @@ zfsctl_snapshot_add(zfs_snapentry_t *se)
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
zfsctl_snapshot_hold(se);
avl_add(&zfs_snapshots_by_name, se);
avl_add(&zfs_snapshots_by_objsetid, se);
if (se->se_spa != NULL)
avl_add(&zfs_snapshots_by_objsetid, se);
}
/*
* Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
* zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
* this can result in the structure being freed if that was the last
* remaining reference.
* Remove a zfs_snapentry_t from the zfs_snapshots_by_name tree and
* zfs_snapshots_by_objsetid tree (if not pending). Upon removal a
* reference is dropped, this can result in the structure being freed
* if that was the last remaining reference.
*/
static void
zfsctl_snapshot_remove(zfs_snapentry_t *se)
{
ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
avl_remove(&zfs_snapshots_by_name, se);
avl_remove(&zfs_snapshots_by_objsetid, se);
if (se->se_spa != NULL)
avl_remove(&zfs_snapshots_by_objsetid, se);
zfsctl_snapshot_rele(se);
}
/*
 * Complete a pending zfs_snapentry_t once its mount has succeeded: record
 * the spa, objset id and root dentry, then link the entry into the
 * zfs_snapshots_by_objsetid tree. The caller must hold zfs_snapshot_lock
 * as writer and the entry must still be pending (se_spa == NULL).
 */
static void
zfsctl_snapshot_fill(zfs_snapentry_t *se, spa_t *spa, uint64_t objsetid,
    struct dentry *root_dentry)
{
	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
	ASSERT3P(se->se_spa, ==, NULL);

	/* Populate every field before the entry becomes visible in the tree. */
	se->se_root_dentry = root_dentry;
	se->se_objsetid = objsetid;
	se->se_spa = spa;

	avl_add(&zfs_snapshots_by_objsetid, se);
}
/*
* Snapshot name comparison function for the zfs_snapshots_by_name.
*/
@@ -312,6 +340,11 @@ zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname)
se = zfsctl_snapshot_find_by_name(old_snapname);
if (se == NULL)
return (SET_ERROR(ENOENT));
if (se->se_spa == NULL) {
/* Snapshot mount is in progress */
zfsctl_snapshot_rele(se);
return (SET_ERROR(EBUSY));
}
zfsctl_snapshot_remove(se);
kmem_strfree(se->se_name);
@@ -430,26 +463,6 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
return (error);
}
/*
 * Check whether an entry for snapname is currently tracked. Returns B_TRUE
 * when zfsctl_snapshot_find_by_name() finds an entry for snapname and
 * B_FALSE otherwise. The lookup is done with zfs_snapshot_lock held as
 * reader.
 *
 * NOTE(review): the enclosing hunk header (-430,26 +463,6) suggests this
 * whole function is removed by the commit shown here — confirm.
 */
static boolean_t
zfsctl_snapshot_ismounted(const char *snapname)
{
zfs_snapentry_t *se;
boolean_t ismounted = B_FALSE;
rw_enter(&zfs_snapshot_lock, RW_READER);
if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
/*
 * Only existence matters here; release the entry right away
 * (presumably the lookup took a hold — verify in find_by_name).
 */
zfsctl_snapshot_rele(se);
ismounted = B_TRUE;
}
rw_exit(&zfs_snapshot_lock);
return (ismounted);
}
/*
* Check if the given inode is a part of the virtual .zfs directory.
*/
@@ -1131,6 +1144,14 @@ zfsctl_snapshot_unmount(const char *snapname, int flags)
}
rw_exit(&zfs_snapshot_lock);
/*
* Wait for any pending auto-mount to complete before unmounting.
*/
mutex_enter(&se->se_mtx);
while (se->se_mounting)
cv_wait(&se->se_cv, &se->se_mtx);
mutex_exit(&se->se_mtx);
exportfs_flush();
if (flags & MNT_FORCE)
@@ -1232,14 +1253,35 @@ zfsctl_snapshot_mount(struct path *path, int flags)
zfs_snapshot_no_setuid ? "nosuid" : "suid");
/*
* Multiple concurrent automounts of a snapshot are never allowed.
* The snapshot may be manually mounted as many times as desired.
* Check if snapshot is already being mounted. If found, wait for
* pending mount to complete before returning success.
*/
if (zfsctl_snapshot_ismounted(full_name)) {
error = 0;
rw_enter(&zfs_snapshot_lock, RW_WRITER);
if ((se = zfsctl_snapshot_find_by_name(full_name)) != NULL) {
rw_exit(&zfs_snapshot_lock);
mutex_enter(&se->se_mtx);
while (se->se_mounting)
cv_wait(&se->se_cv, &se->se_mtx);
/*
* Return the same error as the first mount attempt (0 if
* succeeded, error code if failed).
*/
error = se->se_mount_error;
mutex_exit(&se->se_mtx);
zfsctl_snapshot_rele(se);
goto error;
}
/*
* Create pending entry and mark mount in progress.
*/
se = zfsctl_snapshot_alloc(full_name, full_path, NULL, 0, NULL);
se->se_mounting = B_TRUE;
zfsctl_snapshot_add(se);
zfsctl_snapshot_hold(se);
rw_exit(&zfs_snapshot_lock);
/*
* Attempt to mount the snapshot from user space. Normally this
* would be done using the vfs_kern_mount() function, however that
@@ -1258,6 +1300,9 @@ zfsctl_snapshot_mount(struct path *path, int flags)
argv[9] = full_path;
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
if (error) {
/*
* Mount failed - cleanup pending entry and signal waiters.
*/
if (!(error & MOUNT_BUSY << 8)) {
zfs_dbgmsg("Unable to automount %s error=%d",
full_path, error);
@@ -1273,6 +1318,16 @@ zfsctl_snapshot_mount(struct path *path, int flags)
*/
error = 0;
}
rw_enter(&zfs_snapshot_lock, RW_WRITER);
zfsctl_snapshot_remove(se);
rw_exit(&zfs_snapshot_lock);
mutex_enter(&se->se_mtx);
se->se_mount_error = error;
se->se_mounting = B_FALSE;
cv_broadcast(&se->se_cv);
mutex_exit(&se->se_mtx);
zfsctl_snapshot_rele(se);
goto error;
}
@@ -1289,14 +1344,25 @@ zfsctl_snapshot_mount(struct path *path, int flags)
spath.mnt->mnt_flags |= MNT_SHRINKABLE;
rw_enter(&zfs_snapshot_lock, RW_WRITER);
se = zfsctl_snapshot_alloc(full_name, full_path,
snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
dentry);
zfsctl_snapshot_add(se);
zfsctl_snapshot_fill(se, snap_zfsvfs->z_os->os_spa,
dmu_objset_id(snap_zfsvfs->z_os), dentry);
zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
rw_exit(&zfs_snapshot_lock);
} else {
rw_enter(&zfs_snapshot_lock, RW_WRITER);
zfsctl_snapshot_remove(se);
rw_exit(&zfs_snapshot_lock);
}
path_put(&spath);
/*
* Signal mount completion and cleanup.
*/
mutex_enter(&se->se_mtx);
se->se_mounting = B_FALSE;
cv_broadcast(&se->se_cv);
mutex_exit(&se->se_mtx);
zfsctl_snapshot_rele(se);
error:
kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
kmem_free(full_path, MAXPATHLEN);
@@ -100,15 +100,17 @@ zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
while (n && uio->uio_resid) {
void *paddr;
cnt = MIN(bv->bv_len - skip, n);
size_t offset = bv->bv_offset + skip;
cnt = MIN(PAGE_SIZE - (offset & ~PAGE_MASK),
MIN(bv->bv_len - skip, n));
paddr = zfs_kmap_local(bv->bv_page);
paddr = zfs_kmap_local(bv->bv_page + (offset >> PAGE_SHIFT));
if (rw == UIO_READ) {
/* Copy from buffer 'p' to the bvec data */
memcpy(paddr + bv->bv_offset + skip, p, cnt);
memcpy(paddr + (offset & ~PAGE_MASK), p, cnt);
} else {
/* Copy from bvec data to buffer 'p' */
memcpy(p, paddr + bv->bv_offset + skip, cnt);
memcpy(p, paddr + (offset & ~PAGE_MASK), cnt);
}
zfs_kunmap_local(paddr);
@@ -31,7 +31,6 @@
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -95,11 +95,12 @@ zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
}
/*
* If we need to grow the block size then lock the whole file range.
* If we might grow the block size then lock the whole file range.
* NB: this test should match the check in zfs_grow_blocksize
*/
uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
if (zp->z_size <= zp->z_blksz && end_size > zp->z_blksz &&
(!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
new->lr_offset = 0;
new->lr_length = UINT64_MAX;
}
+8 -8
View File
@@ -1111,13 +1111,6 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,
func_raidz_gen(caddrs, daddr, len, dlen);
for (i = parity-1; i >= 0; i--) {
abd_iter_unmap(&caiters[i]);
c_cabds[i] =
abd_advance_abd_iter(cabds[i], c_cabds[i],
&caiters[i], len);
}
if (dsize > 0) {
abd_iter_unmap(&daiter);
c_dabd =
@@ -1126,6 +1119,13 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,
dsize -= dlen;
}
for (i = parity - 1; i >= 0; i--) {
abd_iter_unmap(&caiters[i]);
c_cabds[i] =
abd_advance_abd_iter(cabds[i], c_cabds[i],
&caiters[i], len);
}
csize -= len;
}
abd_exit_critical(flags);
@@ -1194,7 +1194,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
func_raidz_rec(xaddrs, len, caddrs, mul);
for (i = parity-1; i >= 0; i--) {
for (i = parity - 1; i >= 0; i--) {
abd_iter_unmap(&xiters[i]);
abd_iter_unmap(&citers[i]);
c_tabds[i] =
+53 -40
View File
@@ -371,6 +371,12 @@ static uint_t zfs_arc_eviction_pct = 200;
*/
static uint_t zfs_arc_evict_batch_limit = 10;
/*
* Number of batches to process per parallel eviction task under heavy load,
* to reduce the number of context switches.
*/
static uint_t zfs_arc_evict_batches_limit = 5;
/* number of seconds before growing cache again */
uint_t arc_grow_retry = 5;
@@ -406,8 +412,8 @@ uint_t arc_no_grow_shift = 5;
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
static uint_t arc_min_prefetch_ms;
static uint_t arc_min_prescient_prefetch_ms;
static uint_t arc_min_prefetch;
static uint_t arc_min_prescient_prefetch;
/*
* If this percent of memory is free, don't throttle.
@@ -3766,8 +3772,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3824,9 +3828,10 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost);
/* prefetch buffers have a minimum lifespan */
uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
arc_min_prescient_prefetch : arc_min_prefetch;
if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
MSEC_TO_TICK(min_lifetime)) {
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime) {
ARCSTAT_BUMP(arcstat_evict_skip);
return (bytes_evicted);
}
@@ -3900,7 +3905,7 @@ arc_set_need_free(void)
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t spa, uint64_t bytes)
uint64_t spa, uint64_t bytes, boolean_t *more)
{
multilist_sublist_t *mls;
uint64_t bytes_evicted = 0, real_evicted = 0;
@@ -3984,6 +3989,10 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
multilist_sublist_unlock(mls);
/* Indicate if another iteration may be productive. */
if (more)
*more = (hdr != NULL);
/*
* Increment the count of evicted bytes, and wake up any threads that
* are waiting for the count to reach this value. Since the list is
@@ -4004,21 +4013,12 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
while ((aw = list_head(&arc_evict_waiters)) != NULL &&
aw->aew_count <= arc_evict_count) {
list_remove(&arc_evict_waiters, aw);
cv_broadcast(&aw->aew_cv);
cv_signal(&aw->aew_cv);
}
}
arc_set_need_free();
mutex_exit(&arc_evict_lock);
/*
* If the ARC size is reduced from arc_c_max to arc_c_min (especially
* if the average cached block is small), eviction can be on-CPU for
* many seconds. To ensure that other threads that may be bound to
* this CPU are able to make progress, make a voluntary preemption
* call here.
*/
kpreempt(KPREEMPT_SYNC);
return (bytes_evicted);
}
@@ -4079,8 +4079,18 @@ static void
arc_evict_task(void *arg)
{
/*
 * Eviction task callback: arg is an evict_arg_t describing which
 * multilist sublist to evict from and how many bytes to evict. The
 * number of bytes evicted is stored in eva->eva_evicted.
 *
 * NOTE(review): this listing looks like a diff with the +/- markers
 * stripped; the single arc_evict_state_impl() call directly below
 * appears to be the pre-change body and the loop the post-change body.
 */
evict_arg_t *eva = arg;
eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx,
eva->eva_marker, eva->eva_spa, eva->eva_bytes);
uint64_t total_evicted = 0;
boolean_t more;
uint_t batches = zfs_arc_evict_batches_limit;
/* Process multiple batches to amortize taskq dispatch overhead. */
/*
 * Each pass asks only for the bytes still outstanding. The loop stops
 * when the target is met, the batch budget is used up, or
 * arc_evict_state_impl() reports through 'more' that another pass
 * would not be productive.
 *
 * NOTE(review): batches is unsigned, so if the tunable is 0 the
 * "--batches > 0" test wraps and the batch cap no longer applies —
 * confirm this is intended.
 */
do {
total_evicted += arc_evict_state_impl(eva->eva_ml,
eva->eva_idx, eva->eva_marker, eva->eva_spa,
eva->eva_bytes - total_evicted, &more);
} while (total_evicted < eva->eva_bytes && --batches > 0 && more);
eva->eva_evicted = total_evicted;
}
static void
@@ -4221,18 +4231,19 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
if (bytes == ARC_EVICT_ALL) {
evict = bytes;
} else if (left > ntasks * MIN_EVICT_SIZE) {
} else if (left >= ntasks * MIN_EVICT_SIZE) {
evict = DIV_ROUND_UP(left, ntasks);
} else {
ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
if (ntasks == 1)
ntasks = left / MIN_EVICT_SIZE;
if (ntasks < 2)
use_evcttq = B_FALSE;
else
evict = DIV_ROUND_UP(left, ntasks);
}
}
for (int i = 0; sublists_left > 0; i++, sublist_idx++,
sublists_left--) {
uint64_t bytes_remaining;
uint64_t bytes_evicted;
/* we've reached the end, wrap to the beginning */
@@ -4254,16 +4265,17 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
continue;
}
if (total_evicted < bytes)
bytes_remaining = bytes - total_evicted;
else
break;
bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
markers[sublist_idx], spa, bytes_remaining);
markers[sublist_idx], spa, bytes - total_evicted,
NULL);
scan_evicted += bytes_evicted;
total_evicted += bytes_evicted;
if (total_evicted < bytes)
kpreempt(KPREEMPT_SYNC);
else
break;
}
if (use_evcttq) {
@@ -4838,8 +4850,7 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
*/
return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) +
zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) &&
ddi_get_lbolt() - arc_last_uncached_flush >
MSEC_TO_TICK(arc_min_prefetch_ms / 2)));
ddi_get_lbolt() - arc_last_uncached_flush > arc_min_prefetch / 2));
}
/*
@@ -4889,7 +4900,7 @@ arc_evict_cb(void *arg, zthr_t *zthr)
*/
arc_evict_waiter_t *aw;
while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
cv_broadcast(&aw->aew_cv);
cv_signal(&aw->aew_cv);
}
arc_set_need_free();
}
@@ -5170,9 +5181,8 @@ arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve)
uint64_t last_count = 0;
mutex_enter(&arc_evict_lock);
if (!list_is_empty(&arc_evict_waiters)) {
arc_evict_waiter_t *last =
list_tail(&arc_evict_waiters);
arc_evict_waiter_t *last;
if ((last = list_tail(&arc_evict_waiters)) != NULL) {
last_count = last->aew_count;
} else if (!arc_evict_needed) {
arc_evict_needed = B_TRUE;
@@ -7593,12 +7603,12 @@ arc_tuning_update(boolean_t verbose)
/* Valid range: 1 - N ms */
if (zfs_arc_min_prefetch_ms)
arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
arc_min_prefetch = MSEC_TO_TICK(zfs_arc_min_prefetch_ms);
/* Valid range: 1 - N ms */
if (zfs_arc_min_prescient_prefetch_ms) {
arc_min_prescient_prefetch_ms =
zfs_arc_min_prescient_prefetch_ms;
arc_min_prescient_prefetch =
MSEC_TO_TICK(zfs_arc_min_prescient_prefetch_ms);
}
/* Valid range: 0 - 100 */
@@ -7982,8 +7992,8 @@ arc_init(void)
list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
offsetof(arc_evict_waiter_t, aew_node));
arc_min_prefetch_ms = 1000;
arc_min_prescient_prefetch_ms = 6000;
arc_min_prefetch = MSEC_TO_TICK(1000);
arc_min_prescient_prefetch = MSEC_TO_TICK(6000);
#if defined(_KERNEL)
arc_lowmem_init();
@@ -11290,6 +11300,9 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
"The number of headers to evict per sublist before moving to the next");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batches_limit, UINT, ZMOD_RW,
"The number of batches to run per parallel eviction task");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
"Number of arc_prune threads");
+103 -66
View File
@@ -407,6 +407,9 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
ASSERT3U(*objectp, !=, 0);
VERIFY0(dnode_hold(os, *objectp, ddt,
&ddt->ddt_object_dnode[type][class]));
ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1,
@@ -437,6 +440,10 @@ ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
VERIFY0(count);
VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx));
VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
if (ddt->ddt_object_dnode[type][class] != NULL) {
dnode_rele(ddt->ddt_object_dnode[type][class], ddt);
ddt->ddt_object_dnode[type][class] = NULL;
}
VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
@@ -468,28 +475,38 @@ ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
if (error != 0)
return (error);
error = dnode_hold(ddt->ddt_os, ddt->ddt_object[type][class], ddt,
&ddt->ddt_object_dnode[type][class]);
if (error != 0)
return (error);
error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
&ddt->ddt_histogram[type][class]);
if (error != 0)
return (error);
goto error;
/*
* Seed the cached statistics.
*/
error = ddt_object_info(ddt, type, class, &doi);
if (error)
return (error);
goto error;
error = ddt_object_count(ddt, type, class, &count);
if (error)
return (error);
goto error;
ddo->ddo_count = count;
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
return (0);
error:
dnode_rele(ddt->ddt_object_dnode[type][class], ddt);
ddt->ddt_object_dnode[type][class] = NULL;
return (error);
}
static void
@@ -528,11 +545,11 @@ static int
ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
ddt_entry_t *dde)
{
/*
 * Look up dde->dde_key in the (type, class) DDT object through the
 * type-specific ddt_op_lookup, reading up to DDT_PHYS_SIZE(ddt) bytes
 * into dde->dde_phys. Returns ENOENT when the object is not present.
 *
 * NOTE(review): diff markers appear stripped; the ddt_object_exists()
 * test and the (ddt_os, object) call look like the pre-change lines,
 * the cached-dnode lines like their replacements.
 */
if (!ddt_object_exists(ddt, type, class))
dnode_t *dn = ddt->ddt_object_dnode[type][class];
/* A NULL dnode slot is treated as "object does not exist". */
if (dn == NULL)
return (SET_ERROR(ENOENT));
return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
ddt->ddt_object[type][class], &dde->dde_key,
return (ddt_ops[type]->ddt_op_lookup(dn, &dde->dde_key,
dde->dde_phys, DDT_PHYS_SIZE(ddt)));
}
@@ -540,42 +557,42 @@ static int
ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
const ddt_key_t *ddk)
{
/*
 * Ask the type-specific ddt_op_contains whether key ddk is present in
 * the (type, class) DDT object. Returns ENOENT when the object itself
 * is not present.
 *
 * NOTE(review): diff markers appear stripped; old (objset/object) and
 * new (cached dnode) variants of the same statements are interleaved.
 */
if (!ddt_object_exists(ddt, type, class))
dnode_t *dn = ddt->ddt_object_dnode[type][class];
if (dn == NULL)
return (SET_ERROR(ENOENT));
return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os,
ddt->ddt_object[type][class], ddk));
return (ddt_ops[type]->ddt_op_contains(dn, ddk));
}
/*
 * Issue a prefetch for key ddk in the (type, class) DDT object through the
 * type-specific ddt_op_prefetch. Does nothing when the object is absent.
 *
 * NOTE(review): diff markers appear stripped; the ddt_object_exists() test
 * and the (ddt_os, object) call look like pre-change lines — confirm.
 */
static void
ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
const ddt_key_t *ddk)
{
if (!ddt_object_exists(ddt, type, class))
dnode_t *dn = ddt->ddt_object_dnode[type][class];
/* No dnode recorded for this slot: nothing to prefetch. */
if (dn == NULL)
return;
ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
ddt->ddt_object[type][class], ddk);
ddt_ops[type]->ddt_op_prefetch(dn, ddk);
}
/*
 * Prefetch the whole (type, class) DDT object through the type-specific
 * ddt_op_prefetch_all. Does nothing when the object is absent.
 *
 * NOTE(review): diff markers appear stripped; old and new variants of the
 * existence check and of the call are interleaved — confirm.
 */
static void
ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
if (!ddt_object_exists(ddt, type, class))
dnode_t *dn = ddt->ddt_object_dnode[type][class];
if (dn == NULL)
return;
ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os,
ddt->ddt_object[type][class]);
ddt_ops[type]->ddt_op_prefetch_all(dn);
}
/*
 * Write the lightweight entry ddlwe (key plus DDT_PHYS_SIZE(ddt) bytes of
 * phys data) into the (type, class) DDT object in transaction tx, through
 * the type-specific ddt_op_update. The object must already exist; this is
 * only asserted, not checked at runtime.
 *
 * NOTE(review): diff markers appear stripped; old and new variants of the
 * assertion and of the call are interleaved — confirm.
 */
static int
ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
ASSERT(ddt_object_exists(ddt, type, class));
dnode_t *dn = ddt->ddt_object_dnode[type][class];
ASSERT(dn != NULL);
return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
ddt->ddt_object[type][class], &ddlwe->ddlwe_key,
return (ddt_ops[type]->ddt_op_update(dn, &ddlwe->ddlwe_key,
&ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
}
@@ -583,20 +600,20 @@ static int
ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
const ddt_key_t *ddk, dmu_tx_t *tx)
{
/*
 * Remove key ddk from the (type, class) DDT object in transaction tx
 * through the type-specific ddt_op_remove. The object must already
 * exist; this is only asserted.
 *
 * NOTE(review): diff markers appear stripped; old and new variants of
 * the assertion and of the call are interleaved — confirm.
 */
ASSERT(ddt_object_exists(ddt, type, class));
dnode_t *dn = ddt->ddt_object_dnode[type][class];
ASSERT(dn != NULL);
return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
ddt->ddt_object[type][class], ddk, tx));
return (ddt_ops[type]->ddt_op_remove(dn, ddk, tx));
}
int
ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
uint64_t *walk, ddt_lightweight_entry_t *ddlwe)
{
ASSERT(ddt_object_exists(ddt, type, class));
dnode_t *dn = ddt->ddt_object_dnode[type][class];
ASSERT(dn != NULL);
int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key,
int error = ddt_ops[type]->ddt_op_walk(dn, walk, &ddlwe->ddlwe_key,
&ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
if (error == 0) {
ddlwe->ddlwe_type = type;
@@ -610,10 +627,10 @@ int
ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
uint64_t *count)
{
/*
 * Store the entry count of the (type, class) DDT object in *count via
 * the type-specific ddt_op_count and return its error code. The object
 * must already exist; this is only asserted.
 *
 * NOTE(review): diff markers appear stripped; old and new variants of
 * the assertion and of the call are interleaved — confirm.
 */
ASSERT(ddt_object_exists(ddt, type, class));
dnode_t *dn = ddt->ddt_object_dnode[type][class];
ASSERT(dn != NULL);
return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
ddt->ddt_object[type][class], count));
return (ddt_ops[type]->ddt_op_count(dn, count));
}
int
@@ -1037,13 +1054,6 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
{
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
/* Entry is still in the log, so charge the entry back to it */
if (dde->dde_flags & DDE_FLAG_LOGGED) {
ddt_lightweight_entry_t ddlwe;
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
}
avl_remove(&ddt->ddt_tree, dde);
ddt_free(ddt, dde);
}
@@ -1234,63 +1244,61 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
/* Time to make a new entry. */
dde = ddt_alloc(ddt, &search);
/* Record the time this class was created (used by ddt prune) */
if (ddt->ddt_flags & DDT_FLAG_FLAT)
dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
avl_insert(&ddt->ddt_tree, dde, where);
/* If its in the log tree, we can "load" it from there */
/*
* The entry in ddt_tree has no DDE_FLAG_LOADED, so other possible
* threads will wait even while we drop the lock.
*/
ddt_exit(ddt);
/*
* If there is a log, we should try to "load" from there first.
*/
if (ddt->ddt_flags & DDT_FLAG_LOG) {
ddt_lightweight_entry_t ddlwe;
boolean_t from_flushing;
if (ddt_log_find_key(ddt, &search, &ddlwe)) {
/*
* See if we have the key first, and if so, set up
* the entry.
*/
/* Read-only search, no locks needed (logs stable during I/O) */
if (ddt_log_find_key(ddt, &search, &ddlwe, &from_flushing)) {
dde->dde_type = ddlwe.ddlwe_type;
dde->dde_class = ddlwe.ddlwe_class;
memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
DDT_PHYS_SIZE(ddt));
/* Whatever we found isn't valid for this BP, eject */
if (verify &&
!ddt_entry_lookup_is_valid(ddt, bp, dde)) {
/*
* Check validity. If invalid and no waiters, clean up
* immediately. Otherwise continue setup for waiters.
*/
boolean_t valid = !verify ||
ddt_entry_lookup_is_valid(ddt, bp, dde);
ddt_enter(ddt);
if (!valid && dde->dde_waiters == 0) {
avl_remove(&ddt->ddt_tree, dde);
ddt_free(ddt, dde);
return (NULL);
}
/* Remove it and count it */
if (ddt_log_remove_key(ddt,
ddt->ddt_log_active, &search)) {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
} else {
VERIFY(ddt_log_remove_key(ddt,
ddt->ddt_log_flushing, &search));
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
if (from_flushing) {
dde->dde_flags |= DDE_FLAG_FROM_FLUSHING;
DDT_KSTAT_BUMP(ddt,
dds_lookup_log_flushing_hit);
} else {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
}
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
return (dde);
cv_broadcast(&dde->dde_cv);
return (valid ? dde : NULL);
}
DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
}
/*
* ddt_tree is now stable, so unlock and let everyone else keep moving.
* Anyone landing on this entry will find it without DDE_FLAG_LOADED,
* and go to sleep waiting for it above.
*/
ddt_exit(ddt);
/* Search all store objects for the entry. */
error = ENOENT;
for (type = 0; type < DDT_TYPES; type++) {
@@ -1727,6 +1735,15 @@ ddt_table_free(ddt_t *ddt)
wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_miss);
ddt_log_free(ddt);
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
if (ddt->ddt_object_dnode[type][class] != NULL) {
dnode_rele(ddt->ddt_object_dnode[type][class],
ddt);
ddt->ddt_object_dnode[type][class] = NULL;
}
}
}
ASSERT0(avl_numnodes(&ddt->ddt_tree));
ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
avl_destroy(&ddt->ddt_tree);
@@ -2354,6 +2371,19 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
/* If from flushing log, remove it. */
if (dde->dde_flags & DDE_FLAG_FROM_FLUSHING) {
VERIFY(ddt_log_remove_key(ddt,
ddt->ddt_log_flushing, &ddlwe.ddlwe_key));
}
/* Update class_start to track last modification time */
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
ddt_class_start();
}
ddt_log_entry(ddt, &ddlwe, &dlu);
ddt_sync_scan_entry(ddt, &ddlwe, tx);
ddt_free(ddt, dde);
@@ -2414,6 +2444,13 @@ ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
ddt_lightweight_entry_t ddlwe;
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
/* Update class_start to track last modification time */
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
ddt_class_start();
}
ddt_sync_flush_entry(ddt, &ddlwe,
dde->dde_type, dde->dde_class, tx);
ddt_sync_scan_entry(ddt, &ddlwe, tx);
@@ -2765,7 +2802,7 @@ ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
* If this entry is on the log, then the stored entry is stale
* and we should skip it.
*/
if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL, NULL))
continue;
/* prune older entries */
+22 -10
View File
@@ -252,7 +252,8 @@ ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
}
static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe,
boolean_t hist)
{
/* Create the log tree entry from a live or stored entry */
avl_index_t where;
@@ -262,7 +263,13 @@ ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
ddle = ddt_log_alloc_entry(ddt);
ddle->ddle_key = ddlwe->ddlwe_key;
avl_insert(&ddl->ddl_tree, ddle, where);
} else if (hist) {
ddt_lightweight_entry_t oddlwe;
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &oddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &oddlwe);
}
if (hist)
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
ddle->ddle_type = ddlwe->ddlwe_type;
ddle->ddle_class = ddlwe->ddlwe_class;
memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
@@ -273,8 +280,7 @@ ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
ASSERT3U(dlu->dlu_dbp, !=, NULL);
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe, B_TRUE);
/* Get our block */
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
@@ -381,14 +387,20 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe)
ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing)
{
ddt_log_entry_t *ddle =
avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
if (!ddle)
ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree,
ddk, NULL);
if (ddle) {
if (from_flushing)
*from_flushing = B_FALSE;
} else {
ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
if (!ddle)
return (B_FALSE);
if (!ddle)
return (B_FALSE);
if (from_flushing)
*from_flushing = B_TRUE;
}
if (ddlwe)
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
return (B_TRUE);
@@ -524,7 +536,7 @@ ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
ddlwe.ddlwe_key = dlre->dlre_key;
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
ddt_log_update_entry(ddt, ddl, &ddlwe);
ddt_log_update_entry(ddt, ddl, &ddlwe, B_FALSE);
}
static void
+33 -37
View File
@@ -33,6 +33,7 @@
#include <sys/ddt_impl.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zio_compress.h>
static unsigned int ddt_zap_default_bs = 15;
@@ -56,7 +57,7 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)
/* Call compress function directly to avoid hole detection. */
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, (void *)src, s_len);
abd_get_from_buf_struct(&dabd, dst, d_len);
abd_get_from_buf_struct(&dabd, dst, d_len - 1);
c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level);
abd_free(&dabd);
abd_free(&sabd);
@@ -85,9 +86,10 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
}
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, src, s_len);
size_t c_len = s_len - 1;
abd_get_from_buf_struct(&sabd, src, c_len);
abd_get_from_buf_struct(&dabd, dst, d_len);
VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL));
VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, c_len, d_len, NULL));
abd_free(&dabd);
abd_free(&sabd);
@@ -120,54 +122,48 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
}
/*
 * Look up key ddk in the DDT ZAP and, on success, decompress the stored
 * value into phys (psize bytes) with ddt_zap_decompress(). Returns the
 * error from the ZAP call, or 0 on success.
 *
 * NOTE(review): this listing looks like a diff with the +/- markers
 * stripped. Two variants are interleaved below: one queries the length
 * first (zap_length_uint64) and then allocates and frees exactly csize
 * bytes; the other allocates psize + 1 bytes up front, obtains csize from
 * zap_lookup_length_uint64_by_dnode(), and frees psize + 1 bytes. Confirm
 * against the actual commit which lines are live.
 */
static int
ddt_zap_lookup(objset_t *os, uint64_t object,
const ddt_key_t *ddk, void *phys, size_t psize)
ddt_zap_lookup(dnode_t *dn, const ddt_key_t *ddk, void *phys, size_t psize)
{
uchar_t *cbuf;
uint64_t one, csize;
uint64_t csize;
int error;
error = zap_length_uint64(os, object, (uint64_t *)ddk,
DDT_KEY_WORDS, &one, &csize);
if (error)
return (error);
/* Scratch buffer of psize + 1 bytes for the stored value. */
cbuf = kmem_alloc(psize + 1, KM_SLEEP);
ASSERT3U(one, ==, 1);
ASSERT3U(csize, <=, psize + 1);
cbuf = kmem_alloc(csize, KM_SLEEP);
error = zap_lookup_uint64(os, object, (uint64_t *)ddk,
DDT_KEY_WORDS, 1, csize, cbuf);
if (error == 0)
error = zap_lookup_length_uint64_by_dnode(dn, (uint64_t *)ddk,
DDT_KEY_WORDS, 1, psize + 1, cbuf, &csize);
if (error == 0) {
/* The stored value must fit in the scratch buffer. */
ASSERT3U(csize, <=, psize + 1);
ddt_zap_decompress(cbuf, phys, csize, psize);
}
/* The size passed to kmem_free() must match the paired kmem_alloc(). */
kmem_free(cbuf, csize);
kmem_free(cbuf, psize + 1);
return (error);
}
/*
 * Test for the presence of key ddk in the DDT ZAP by asking only for the
 * entry's length; both output pointers are NULL, so only the return code
 * is used.
 *
 * NOTE(review): diff markers appear stripped; the (os, object) signature
 * and call look like pre-change lines, the dnode ones like replacements.
 */
static int
ddt_zap_contains(objset_t *os, uint64_t object, const ddt_key_t *ddk)
ddt_zap_contains(dnode_t *dn, const ddt_key_t *ddk)
{
return (zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS,
NULL, NULL));
return (zap_length_uint64_by_dnode(dn, (uint64_t *)ddk,
DDT_KEY_WORDS, NULL, NULL));
}
/*
 * Prefetch the ZAP entry for key ddk. The return value of the prefetch
 * call is deliberately discarded.
 *
 * NOTE(review): diff markers appear stripped; the (os, object) signature
 * and call look like pre-change lines, the dnode ones like replacements.
 */
static void
ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk)
ddt_zap_prefetch(dnode_t *dn, const ddt_key_t *ddk)
{
(void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS);
(void) zap_prefetch_uint64_by_dnode(dn, (uint64_t *)ddk,
DDT_KEY_WORDS);
}
/*
 * Prefetch the entire DDT ZAP object. The return value of the prefetch
 * call is deliberately discarded.
 *
 * NOTE(review): diff markers appear stripped. The dnode variant still
 * calls zap_prefetch_object() with an objset and object number, taken
 * from dn->dn_objset and dn->dn_object.
 */
static void
ddt_zap_prefetch_all(objset_t *os, uint64_t object)
ddt_zap_prefetch_all(dnode_t *dn)
{
(void) zap_prefetch_object(os, object);
(void) zap_prefetch_object(dn->dn_objset, dn->dn_object);
}
static int
ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
ddt_zap_update(dnode_t *dn, const ddt_key_t *ddk,
const void *phys, size_t psize, dmu_tx_t *tx)
{
const size_t cbuf_size = psize + 1;
@@ -176,7 +172,7 @@ ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
uint64_t csize = ddt_zap_compress(phys, cbuf, psize, cbuf_size);
int error = zap_update_uint64(os, object, (uint64_t *)ddk,
int error = zap_update_uint64_by_dnode(dn, (uint64_t *)ddk,
DDT_KEY_WORDS, 1, csize, cbuf, tx);
kmem_free(cbuf, cbuf_size);
@@ -185,15 +181,14 @@ ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
}
/*
 * Remove the ZAP entry for key ddk in transaction tx and return the ZAP
 * call's error code.
 *
 * NOTE(review): diff markers appear stripped; the (os, object) signature
 * and call look like pre-change lines, the dnode ones like replacements.
 */
static int
ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
dmu_tx_t *tx)
ddt_zap_remove(dnode_t *dn, const ddt_key_t *ddk, dmu_tx_t *tx)
{
return (zap_remove_uint64(os, object, (uint64_t *)ddk,
return (zap_remove_uint64_by_dnode(dn, (uint64_t *)ddk,
DDT_KEY_WORDS, tx));
}
static int
ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
ddt_zap_walk(dnode_t *dn, uint64_t *walk, ddt_key_t *ddk,
void *phys, size_t psize)
{
zap_cursor_t zc;
@@ -209,9 +204,10 @@ ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
* scrub I/Os for each ZAP block that we read in, so
* reading the ZAP is unlikely to be the bottleneck.
*/
zap_cursor_init_noprefetch(&zc, os, object);
zap_cursor_init_noprefetch(&zc, dn->dn_objset, dn->dn_object);
} else {
zap_cursor_init_serialized(&zc, os, object, *walk);
zap_cursor_init_serialized(&zc, dn->dn_objset, dn->dn_object,
*walk);
}
if ((error = zap_cursor_retrieve(&zc, za)) == 0) {
uint64_t csize = za->za_num_integers;
@@ -221,7 +217,7 @@ ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
uchar_t *cbuf = kmem_alloc(csize, KM_SLEEP);
error = zap_lookup_uint64(os, object, (uint64_t *)za->za_name,
error = zap_lookup_uint64_by_dnode(dn, (uint64_t *)za->za_name,
DDT_KEY_WORDS, 1, csize, cbuf);
ASSERT0(error);
if (error == 0) {
@@ -240,9 +236,9 @@ ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
}
/*
 * Store the number of entries in the DDT ZAP in *count and return the ZAP
 * call's error code.
 *
 * NOTE(review): diff markers appear stripped; the (os, object) signature
 * and call look like pre-change lines, the dnode ones like replacements.
 */
static int
ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count)
ddt_zap_count(dnode_t *dn, uint64_t *count)
{
return (zap_count(os, object, count));
return (zap_count_by_dnode(dn, count));
}
const ddt_ops_t ddt_zap_ops = {
+4 -2
View File
@@ -161,7 +161,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
ASSERT3U(BP_GET_BIRTH(bp), >,
dsl_dataset_phys(ds)->ds_prev_snap_txg);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
/* ds_dbuf is pre-dirtied in dsl_dataset_sync(). */
ASSERT(dmu_buf_is_dirty(ds->ds_dbuf, tx));
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
dsl_dataset_phys(ds)->ds_referenced_bytes += used;
@@ -274,7 +275,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
ASSERT(!ds->ds_is_snapshot);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
/* ds_dbuf is pre-dirtied in dsl_dataset_sync(). */
ASSERT(dmu_buf_is_dirty(ds->ds_dbuf, tx));
/*
* Track block for livelist, but ignore embedded blocks because
+71 -42
View File
@@ -189,16 +189,16 @@ static uint_t zfs_scan_mem_lim_fact = 20;
static uint_t zfs_scan_mem_lim_soft_fact = 20;
/* minimum milliseconds to scrub per txg */
static uint_t zfs_scrub_min_time_ms = 1000;
static uint_t zfs_scrub_min_time_ms = 750;
/* minimum milliseconds to obsolete per txg */
static uint_t zfs_obsolete_min_time_ms = 500;
/* minimum milliseconds to free per txg */
static uint_t zfs_free_min_time_ms = 1000;
static uint_t zfs_free_min_time_ms = 500;
/* minimum milliseconds to resilver per txg */
static uint_t zfs_resilver_min_time_ms = 3000;
static uint_t zfs_resilver_min_time_ms = 1500;
static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */
int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
@@ -208,7 +208,13 @@ static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
static uint64_t zfs_async_block_max_blocks = UINT64_MAX;
/* max number of dedup blocks to free in a single TXG */
static uint64_t zfs_max_async_dedup_frees = 100000;
static uint64_t zfs_max_async_dedup_frees = 250000;
/*
* After freeing this many async ZIOs (dedup, clone, gang blocks), wait for
* them to complete before continuing. This prevents unbounded I/O queueing.
*/
static uint64_t zfs_async_free_zio_wait_interval = 2000;
/* set to disable resilver deferring */
static int zfs_resilver_disable_defer = B_FALSE;
@@ -217,16 +223,14 @@ static int zfs_resilver_disable_defer = B_FALSE;
static uint_t zfs_resilver_defer_percent = 10;
/*
* We wait a few txgs after importing a pool to begin scanning so that
* the import / mounting code isn't held up by scrub / resilver IO.
* Unfortunately, it is a bit difficult to determine exactly how long
* this will take since userspace will trigger fs mounts asynchronously
* and the kernel will create zvol minors asynchronously. As a result,
* the value provided here is a bit arbitrary, but represents a
* reasonable estimate of how many txgs it will take to finish fully
* importing a pool
* Number of TXGs to wait after importing before starting background
* work (async destroys, scan/scrub/resilver operations). This allows
* the import command and filesystem mounts to complete quickly without
* being delayed by background activities. The value is somewhat arbitrary
* since userspace triggers filesystem mounts asynchronously, but 5 TXGs
* provides a reasonable window for import completion in most cases.
*/
#define SCAN_IMPORT_WAIT_TXGS 5
static uint_t zfs_import_defer_txgs = 5;
#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
@@ -1665,7 +1669,7 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
* or
* - the scan queue has reached its memory use limit
*/
uint64_t curr_time_ns = gethrtime();
uint64_t curr_time_ns = getlrtime();
uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
uint64_t sync_time_ns = curr_time_ns -
scn->scn_dp->dp_spa->spa_sync_starttime;
@@ -1727,7 +1731,7 @@ dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
* - the spa is shutting down because this pool is being exported
* or the machine is rebooting.
*/
uint64_t curr_time_ns = gethrtime();
uint64_t curr_time_ns = getlrtime();
uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time;
uint64_t sync_time_ns = curr_time_ns -
scn->scn_dp->dp_spa->spa_sync_starttime;
@@ -3239,7 +3243,7 @@ static boolean_t
scan_io_queue_check_suspend(dsl_scan_t *scn)
{
/* See comment in dsl_scan_check_suspend() */
uint64_t curr_time_ns = gethrtime();
uint64_t curr_time_ns = getlrtime();
uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
uint64_t sync_time_ns = curr_time_ns -
scn->scn_dp->dp_spa->spa_sync_starttime;
@@ -3592,12 +3596,12 @@ dsl_scan_async_block_should_pause(dsl_scan_t *scn)
}
if (zfs_max_async_dedup_frees != 0 &&
scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) {
scn->scn_async_frees_this_txg >= zfs_max_async_dedup_frees) {
return (B_TRUE);
}
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
elapsed_nanosecs = getlrtime() - scn->scn_sync_start_time;
return (elapsed_nanosecs / (NANOSEC / 2) > zfs_txg_timeout ||
(NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) ||
spa_shutting_down(scn->scn_dp->dp_spa));
@@ -3614,14 +3618,32 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
return (SET_ERROR(ERESTART));
}
zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
dmu_tx_get_txg(tx), bp, 0));
zio_t *zio = zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
dmu_tx_get_txg(tx), bp, 0);
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
-bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
-BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
scn->scn_visited_this_txg++;
if (BP_GET_DEDUP(bp))
scn->scn_dedup_frees_this_txg++;
if (zio != NULL) {
/*
* zio_free_sync() returned a ZIO, meaning this is an
* async I/O (dedup, clone or gang block).
*/
scn->scn_async_frees_this_txg++;
zio_nowait(zio);
/*
* After issuing N async ZIOs, wait for them to complete.
* This makes time limits work with actual I/O completion
* times, not just queuing times.
*/
uint64_t i = zfs_async_free_zio_wait_interval;
if (i != 0 && (scn->scn_async_frees_this_txg % i) == 0) {
VERIFY0(zio_wait(scn->scn_zio_root));
scn->scn_zio_root = zio_root(scn->scn_dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
}
}
return (0);
}
@@ -3865,10 +3887,10 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
"free_bpobj/bptree on %s in txg %llu; err=%u",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
NSEC2MSEC(getlrtime() - scn->scn_sync_start_time),
spa->spa_name, (longlong_t)tx->tx_txg, err);
scn->scn_visited_this_txg = 0;
scn->scn_dedup_frees_this_txg = 0;
scn->scn_async_frees_this_txg = 0;
/*
* Write out changes to the DDT and the BRT that may be required
@@ -4196,14 +4218,14 @@ dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
}
spa->spa_scrub_active = B_TRUE;
scn->scn_sync_start_time = gethrtime();
scn->scn_sync_start_time = getlrtime();
/*
* zfs_scan_suspend_progress can be set to disable scrub progress.
* See more detailed comment in dsl_scan_sync().
*/
if (zfs_scan_suspend_progress) {
uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
uint64_t scan_time_ns = getlrtime() - scn->scn_sync_start_time;
int mintime = zfs_scrub_min_time_ms;
while (zfs_scan_suspend_progress &&
@@ -4211,7 +4233,7 @@ dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
!spa_shutting_down(scn->scn_dp->dp_spa) &&
NSEC2MSEC(scan_time_ns) < mintime) {
delay(hz);
scan_time_ns = gethrtime() - scn->scn_sync_start_time;
scan_time_ns = getlrtime() - scn->scn_sync_start_time;
}
return;
}
@@ -4394,6 +4416,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (spa_shutting_down(spa))
return;
/*
* Wait a few txgs after importing before doing background work
* (async destroys and scanning). This should help the import
* command to complete quickly.
*/
if (spa->spa_syncing_txg < spa->spa_first_txg + zfs_import_defer_txgs)
return;
/*
* If the scan is inactive due to a stalled async destroy, try again.
*/
@@ -4402,7 +4432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
/* reset scan statistics */
scn->scn_visited_this_txg = 0;
scn->scn_dedup_frees_this_txg = 0;
scn->scn_async_frees_this_txg = 0;
scn->scn_holes_this_txg = 0;
scn->scn_lt_min_this_txg = 0;
scn->scn_gt_max_this_txg = 0;
@@ -4413,7 +4443,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
scn->scn_avg_zio_size_this_txg = 0;
scn->scn_zios_this_txg = 0;
scn->scn_suspending = B_FALSE;
scn->scn_sync_start_time = gethrtime();
scn->scn_sync_start_time = getlrtime();
spa->spa_scrub_active = B_TRUE;
/*
@@ -4430,13 +4460,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
return;
/*
* Wait a few txgs after importing to begin scanning so that
* we can get the pool imported quickly.
*/
if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
return;
/*
* zfs_scan_suspend_progress can be set to disable scan progress.
* We don't want to spin the txg_sync thread, so we add a delay
@@ -4444,7 +4467,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
* useful for testing and debugging.
*/
if (zfs_scan_suspend_progress) {
uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
uint64_t scan_time_ns = getlrtime() - scn->scn_sync_start_time;
uint_t mintime = (scn->scn_phys.scn_func ==
POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms :
zfs_scrub_min_time_ms;
@@ -4454,7 +4477,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
!spa_shutting_down(scn->scn_dp->dp_spa) &&
NSEC2MSEC(scan_time_ns) < mintime) {
delay(hz);
scan_time_ns = gethrtime() - scn->scn_sync_start_time;
scan_time_ns = getlrtime() - scn->scn_sync_start_time;
}
return;
}
@@ -4584,7 +4607,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
"%llu in ddt, %llu > maxtxg)",
(longlong_t)scn->scn_visited_this_txg,
spa->spa_name,
(longlong_t)NSEC2MSEC(gethrtime() -
(longlong_t)NSEC2MSEC(getlrtime() -
scn->scn_sync_start_time),
(longlong_t)scn->scn_objsets_visited_this_txg,
(longlong_t)scn->scn_holes_this_txg,
@@ -4625,7 +4648,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
(longlong_t)scn->scn_zios_this_txg,
spa->spa_name,
(longlong_t)scn->scn_segs_this_txg,
(longlong_t)NSEC2MSEC(gethrtime() -
(longlong_t)NSEC2MSEC(getlrtime() -
scn->scn_sync_start_time),
(longlong_t)scn->scn_avg_zio_size_this_txg,
(longlong_t)scn->scn_avg_seg_size_this_txg);
@@ -5319,7 +5342,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW,
"Max number of blocks freed in one txg");
ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW,
"Max number of dedup blocks freed in one txg");
"Max number of dedup, clone or gang blocks freed in one txg");
ZFS_MODULE_PARAM(zfs, zfs_, async_free_zio_wait_interval, U64, ZMOD_RW,
"Wait for pending free I/Os after issuing this many asynchronously");
ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
"Enable processing of the free_bpobj");
@@ -5336,6 +5362,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW,
"Scrub using legacy non-sequential method");
ZFS_MODULE_PARAM(zfs, zfs_, import_defer_txgs, UINT, ZMOD_RW,
"Number of TXGs to defer background work after pool import");
ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW,
"Scan progress on-disk checkpointing interval");
+1 -1
View File
@@ -10449,7 +10449,7 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_t *dp = spa->spa_dsl_pool;
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
spa->spa_sync_starttime = gethrtime();
spa->spa_sync_starttime = getlrtime();
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+1 -1
View File
@@ -720,7 +720,7 @@ spa_deadman(void *arg)
return;
zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
(getlrtime() - spa->spa_sync_starttime) / NANOSEC,
(u_longlong_t)++spa->spa_deadman_calls);
if (zfs_deadman_enabled)
vdev_deadman(spa->spa_root_vdev, FTAG);
@@ -2703,16 +2703,6 @@ vdev_raidz_io_start(zio_t *zio)
next_offset = synced_offset;
}
}
if (use_scratch) {
zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
"%lld next_offset=%lld use_scratch=%u",
zio,
zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
(long long)zio->io_offset,
(long long)synced_offset,
(long long)next_offset,
use_scratch);
}
rm = vdev_raidz_map_alloc_expanded(zio,
tvd->vdev_ashift, vdrz->vd_physical_width,
@@ -2851,8 +2841,6 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
continue;
if (abd_cmp(orig[c], rc->rc_abd) != 0) {
zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
vdev_raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
@@ -3175,10 +3163,6 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
*/
ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
"offset=%llx",
zio, c, rc->rc_devidx, (long long)rc->rc_offset);
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE,
+4 -1
View File
@@ -878,7 +878,8 @@ fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
int
fzap_lookup(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *ncp)
char *realname, int rn_len, boolean_t *ncp,
uint64_t *actual_num_integers)
{
zap_leaf_t *l;
zap_entry_handle_t zeh;
@@ -898,6 +899,8 @@ fzap_lookup(zap_name_t *zn,
}
err = zap_entry_read(&zeh, integer_size, num_integers, buf);
if (err == 0 && actual_num_integers != NULL)
*actual_num_integers = zeh.zeh_num_integers;
(void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
if (ncp) {
*ncp = zap_entry_normalization_conflict(&zeh,
+70 -10
View File
@@ -1049,6 +1049,24 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
return (err);
}
/*
 * Return, via *count, the number of entries in the ZAP object backed by
 * the dnode `dn`.  The ZAP is held with a reader lock for the duration of
 * the call.  Returns 0 on success, or the error from locking the ZAP or
 * from fzap_count().
 */
int
zap_count_by_dnode(dnode_t *dn, uint64_t *count)
{
	zap_t *zap;
	int error;

	error = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG,
	    &zap);
	if (error != 0)
		return (error);

	if (zap->zap_ismicro) {
		/* A microzap keeps its entry count in the in-core zap_t. */
		*count = zap->zap_m.zap_num_entries;
	} else {
		error = fzap_count(zap, count);
	}

	zap_unlockdir(zap, FTAG);
	return (error);
}
/*
* zn may be NULL; if not specified, it will be computed if needed.
* See also the comment above zap_entry_normalization_conflict().
@@ -1127,7 +1145,7 @@ zap_lookup_impl(zap_t *zap, const char *name,
if (!zap->zap_ismicro) {
err = fzap_lookup(zn, integer_size, num_integers, buf,
realname, rn_len, ncp);
realname, rn_len, ncp, NULL);
} else {
zfs_btree_index_t idx;
mzap_ent_t *mze = mze_find(zn, &idx);
@@ -1282,8 +1300,9 @@ zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
}
static int
zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
zap_lookup_length_uint64_impl(zap_t *zap, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
uint64_t *actual_num_integers)
{
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
@@ -1292,7 +1311,7 @@ zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
}
int err = fzap_lookup(zn, integer_size, num_integers, buf,
NULL, 0, NULL);
NULL, 0, NULL, actual_num_integers);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (err);
@@ -1308,9 +1327,9 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
num_integers, buf);
/* zap_lookup_uint64_impl() calls zap_unlockdir() */
err = zap_lookup_length_uint64_impl(zap, key, key_numints,
integer_size, num_integers, buf, NULL);
/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
return (err);
}
@@ -1324,9 +1343,26 @@ zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
num_integers, buf);
/* zap_lookup_uint64_impl() calls zap_unlockdir() */
err = zap_lookup_length_uint64_impl(zap, key, key_numints,
integer_size, num_integers, buf, NULL);
/* zap_lookup_length_uint64_impl() calls zap_unlockdir() */
return (err);
}
/*
 * Look up the entry with the given uint64-array key in the ZAP backed by
 * the dnode `dn`, copying up to `num_integers` values of `integer_size`
 * bytes into `buf`.  If `actual_num_integers` is non-NULL it is passed
 * through to the lookup so the caller can learn the entry's stored
 * integer count.  Returns 0 on success or an errno value.
 */
int
zap_lookup_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
    uint64_t *actual_num_integers)
{
	zap_t *zap;
	int error;

	error = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG,
	    &zap);
	if (error == 0) {
		/*
		 * No zap_unlockdir() here: zap_lookup_length_uint64_impl()
		 * drops the lock itself before returning.
		 */
		error = zap_lookup_length_uint64_impl(zap, key, key_numints,
		    integer_size, num_integers, buf, actual_num_integers);
	}
	return (error);
}
@@ -1395,6 +1431,27 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
return (err);
}
/*
 * Query the size of the value stored under a uint64-array key in the ZAP
 * backed by the dnode `dn`.  The result is reported by fzap_length()
 * through `integer_size` and `num_integers`.  Returns 0 on success,
 * ENOTSUP if a uint64 name cannot be allocated for this ZAP, or the error
 * from locking the ZAP or from fzap_length().
 */
int
zap_length_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;
	zap_name_t *name;
	int error;

	error = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG,
	    &zap);
	if (error != 0)
		return (error);

	name = zap_name_alloc_uint64(zap, key, key_numints);
	if (name == NULL) {
		/* Drop the ZAP lock before reporting the failure. */
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	error = fzap_length(name, integer_size, num_integers);

	zap_name_free(name);
	zap_unlockdir(zap, FTAG);
	return (error);
}
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
@@ -2003,6 +2060,7 @@ EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_lookup_length_uint64_by_dnode);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
@@ -2016,12 +2074,14 @@ EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_length_uint64_by_dnode);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_count_by_dnode);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
+4 -4
View File
@@ -433,13 +433,13 @@ zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
} else {
return (B_FALSE);
}
if (quotaobj == 0 && default_quota == 0)
return (B_FALSE);
if (zfsvfs->z_replay)
return (B_FALSE);
(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id);
if (quotaobj == 0) {
if (default_quota == 0)
return (B_FALSE);
quota = default_quota;
} else {
err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
@@ -484,13 +484,13 @@ zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
} else {
return (B_FALSE);
}
if (quotaobj == 0 && default_quota == 0)
return (B_FALSE);
if (zfsvfs->z_replay)
return (B_FALSE);
(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id);
if (quotaobj == 0) {
if (default_quota == 0)
return (B_FALSE);
quota = default_quota;
} else {
err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+18 -5
View File
@@ -4067,19 +4067,21 @@ zio_ddt_write(zio_t *zio)
/*
* We need to write. We will create a new write with the copies
* property adjusted to match the number of DVAs we need to need to
* grow the DDT entry by to satisfy the request.
* property adjusted to match the number of DVAs we need to grow
* the DDT entry by to satisfy the request.
*/
zio_prop_t czp = *zp;
zio_prop_t czp;
if (have_dvas > 0 || parent_dvas > 0) {
czp = *zp;
czp.zp_copies = need_dvas;
czp.zp_gang_copies = 0;
zp = &czp;
} else {
ASSERT3U(czp.zp_copies, ==, need_dvas);
ASSERT3U(zp->zp_copies, ==, need_dvas);
}
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
@@ -4157,6 +4159,17 @@ zio_ddt_free(zio_t *zio)
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
if (v != DDT_PHYS_NONE)
ddt_phys_decref(dde->dde_phys, v);
else
/*
* If the entry was found but the phys was not, then
* this block must have been pruned from the dedup
* table, and the entry refers to a later version of
* this data. Therefore, the caller is trying to delete
* the only stored instance of this block, and so we
* need to do a normal (not dedup) free. Clear dde so
* we fall into the block below.
*/
dde = NULL;
}
ddt_exit(ddt);
+3 -1
View File
@@ -7,7 +7,9 @@ REF="HEAD"
test_commit_bodylength()
{
length="72"
body=$(git log --no-show-signature -n 1 --pretty=%b "$REF" | grep -Ev "http(s)*://" | grep -E -m 1 ".{$((length + 1))}")
body=$(git log --no-show-signature -n 1 --pretty=%b "$REF" |
grep -Evi -e "http(s)*://" -e "signed-off-by:" -e "reviewed-by:" |
grep -E -m 1 ".{$((length + 1))}")
if [ -n "$body" ]; then
echo "error: commit message body contains line over ${length} characters"
return 1
+2 -2
View File
@@ -10,7 +10,7 @@ RET=0
# check for exec stacks
OUT=$(scanelf -qyRAF '%e %p' "$1")
if [ x"${OUT}" != x ]; then
if [ "${OUT}" != "" ]; then
RET=2
echo "The following files contain writable and executable sections"
echo " Files with such sections will not work properly (or at all!) on some"
@@ -26,7 +26,7 @@ fi
# check for TEXTRELS
OUT=$(scanelf -qyRAF '%T %p' "$1")
if [ x"${OUT}" != x ]; then
if [ "${OUT}" != "" ]; then
RET=2
echo "The following files contain runtime text relocations"
echo " Text relocations force the dynamic linker to perform extra"
@@ -706,7 +706,8 @@ tags = ['functional', 'deadman']
[tests/functional/dedup]
tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing',
'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_zap_shrink']
'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_prune_leak',
'dedup_zap_shrink']
pre =
post =
tags = ['functional', 'dedup']
@@ -1019,7 +1020,7 @@ tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos',
'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos',
'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos',
'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos',
'snapshot_017_pos', 'snapshot_018_pos']
'snapshot_017_pos', 'snapshot_018_pos', 'snapshot_019_pos']
tags = ['functional', 'snapshot']
[tests/functional/snapused]
@@ -580,7 +580,7 @@ tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos',
'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos',
'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos',
'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos',
'snapshot_018_pos']
'snapshot_018_pos', 'snapshot_019_pos']
tags = ['functional', 'snapshot']
[tests/functional/snapused]
@@ -36,12 +36,13 @@
#include <string.h>
#include <unistd.h>
#ifdef __FreeBSD__
#define loff_t off_t
#if defined(_GNU_SOURCE) && defined(__linux__)
_Static_assert(sizeof (loff_t) == sizeof (off_t),
"loff_t and off_t must be the same size");
#endif
ssize_t
copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int)
copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int)
__attribute__((weak));
static void *
@@ -42,12 +42,13 @@
#include <sys/stat.h>
#include <sys/mman.h>
#ifdef __FreeBSD__
#define loff_t off_t
#if defined(_GNU_SOURCE) && defined(__linux__)
_Static_assert(sizeof (loff_t) == sizeof (off_t),
"loff_t and off_t must be the same size");
#endif
ssize_t
copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int)
copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int)
__attribute__((weak));
static int
@@ -59,16 +59,17 @@
#endif
#endif /* __NR_copy_file_range */
#ifdef __FreeBSD__
#define loff_t off_t
#if defined(_GNU_SOURCE) && defined(__linux__)
_Static_assert(sizeof (loff_t) == sizeof (off_t),
"loff_t and off_t must be the same size");
#endif
ssize_t
copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int)
copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int)
__attribute__((weak));
static inline ssize_t
cf_copy_file_range(int sfd, loff_t *soff, int dfd, loff_t *doff,
cf_copy_file_range(int sfd, off_t *soff, int dfd, off_t *doff,
size_t len, unsigned int flags)
{
if (copy_file_range)
@@ -151,9 +152,9 @@ usage(void)
}
int do_clone(int sfd, int dfd);
int do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len);
int do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len);
int do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len);
int do_clonerange(int sfd, int dfd, off_t soff, off_t doff, size_t len);
int do_copyfilerange(int sfd, int dfd, off_t soff, off_t doff, size_t len);
int do_deduperange(int sfd, int dfd, off_t soff, off_t doff, size_t len);
int quiet = 0;
@@ -203,7 +204,7 @@ main(int argc, char **argv)
abort();
}
loff_t soff = 0, doff = 0;
off_t soff = 0, doff = 0;
size_t len = SSIZE_MAX;
unsigned long long len2;
if ((argc-optind) == 5) {
@@ -295,7 +296,7 @@ do_clone(int sfd, int dfd)
}
int
do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len)
do_clonerange(int sfd, int dfd, off_t soff, off_t doff, size_t len)
{
if (!quiet)
fprintf(stderr, "using FICLONERANGE\n");
@@ -314,7 +315,7 @@ do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len)
}
int
do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len)
do_copyfilerange(int sfd, int dfd, off_t soff, off_t doff, size_t len)
{
if (!quiet)
fprintf(stderr, "using copy_file_range\n");
@@ -341,7 +342,7 @@ do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len)
}
int
do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len)
do_deduperange(int sfd, int dfd, off_t soff, off_t doff, size_t len)
{
if (!quiet)
fprintf(stderr, "using FIDEDUPERANGE\n");
@@ -3861,8 +3861,6 @@ function directory_diff # dir_a dir_b
# do not match there is a "c" entry in one of the columns).
if rsync --version | grep -q "[, ] crtimes"; then
args+=("--crtimes")
else
log_note "This rsync package does not support --crtimes (-N)."
fi
# If we are testing a ZIL replay, we need to ignore timestamp changes.
@@ -1482,6 +1482,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/dedup/dedup_legacy_fdt_upgrade.ksh \
functional/dedup/dedup_legacy_fdt_mixed.ksh \
functional/dedup/dedup_prune.ksh \
functional/dedup/dedup_prune_leak.ksh \
functional/dedup/dedup_quota.ksh \
functional/dedup/dedup_zap_shrink.ksh \
functional/delegate/cleanup.ksh \
@@ -2121,6 +2122,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/snapshot/snapshot_016_pos.ksh \
functional/snapshot/snapshot_017_pos.ksh \
functional/snapshot/snapshot_018_pos.ksh \
functional/snapshot/snapshot_019_pos.ksh \
functional/snapused/cleanup.ksh \
functional/snapused/setup.ksh \
functional/snapused/snapused_001_pos.ksh \
@@ -0,0 +1,86 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2025, Klara Inc.
# Copyright (c) 2025, Nutanix Inc.
#
# DESCRIPTION:
# Verify that pruning entries from the DDT with zpool ddtprune does not
# cause a space leak when the pruned blocks are later freed.
#
# STRATEGY:
# 1. Create a pool with dedup=on
# 2. Add non-duplicate entries to the DDT
# 3. ddtprune all entries
# 4. Remove the file
# 5. Verify there's no space leak
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/events/events_common.kshlib
verify_runnable "both"
log_assert "Verify DDT pruning does not cause space leak"
# We set the dedup log txg interval to 1, to get a log flush every txg,
# effectively disabling the log. Without this it's hard to predict when
# entries appear in the DDT ZAP
log_must save_tunable DEDUP_LOG_TXG_MAX
log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN
log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000
# Exit handler (registered with log_onexit below): destroy the test pool
# if it still exists, then restore the two dedup log tunables that were
# saved and overridden at the top of this test.
function cleanup
{
if poolexists $TESTPOOL ; then
destroy_pool $TESTPOOL
fi
log_must restore_tunable DEDUP_LOG_TXG_MAX
log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN
}
log_onexit cleanup
log_must zpool create -f $TESTPOOL $DISKS
log_must zfs create -o dedup=on $TESTPOOL/$TESTFS
typeset mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
log_must dd if=/dev/urandom of=$mountpoint/f1 bs=1M count=16
# We seem to need a number of txg syncs here to make the leak reproduce
# more consistently
for i in $(seq 50); do
zpool sync $TESTPOOL
done
log_must zpool ddtprune -p 100 $TESTPOOL
log_must rm $mountpoint/f1
sync_pool $TESTPOOL
zdb_out=$(zdb -bcc $TESTPOOL)
echo "$zdb_out"
if echo "$zdb_out" | grep -q "leaked space"; then
log_fail "DDT pruning causes space leak"
fi
log_pass "DDT pruning does not cause space leak"
@@ -0,0 +1,82 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2025 iXsystems, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/snapshot/snapshot.cfg
#
# DESCRIPTION:
# Verify that parallel snapshot automount operations don't cause AVL tree
# panic due to duplicate mount attempts.
#
# STRATEGY:
# 1. Create a filesystem with snapdir=visible
# 2. Create a snapshot
# 3. Trigger parallel ls operations on the snapshot directory
# 4. Verify no kernel panic occurred and snapshot is accessible
#
# Exit handler (registered with log_onexit below): destroy the test pool
# created by this test.
function cleanup
{
destroy_pool $TESTPOOL
}
verify_runnable "both"
log_assert "Verify parallel snapshot automount doesn't cause AVL tree panic"
log_onexit cleanup
# Create pool and filesystem
create_pool $TESTPOOL $DISKS
log_must zfs create -o snapdir=visible -o mountpoint=$TESTDIR $TESTPOOL/$TESTFS
# Create a snapshot
log_must zfs snapshot $SNAPFS
# Trigger parallel automount operations to reproduce the race condition.
# Multiple concurrent ls operations will attempt to automount the same
# unmounted snapshot, which previously could cause duplicate mount helpers
# and AVL tree panic.
snapdir_path="$TESTDIR/.zfs/snapshot/$TESTSNAP"
for i in {1..100}
do
ls $snapdir_path >/dev/null 2>&1 &
done
# Wait for all background processes to complete
wait
# Verify the snapshot is accessible and properly mounted after parallel access
log_must ls $snapdir_path
# Verify we can unmount the filesystem cleanly. This confirms no processes
# are stuck in a syscall and all automounted snapshots were unmounted properly.
# If the AVL panic occurred, unmount would fail.
log_must zfs unmount $TESTPOOL/$TESTFS
log_pass "Parallel snapshot automount completed without AVL tree panic"
@@ -73,7 +73,6 @@ function do_test {
block_device_wait $zvolpath
# Write using sync (creates FLUSH calls after writes, but not FUA)
old_vdev_writes=$(get_sync $DISK1)
old_log_writes=$(get_sync $datafile3)
log_must fio --name=write_iops --size=5M \
@@ -81,20 +80,13 @@ function do_test {
--iodepth=1 --rw=randwrite --group_reporting=1 \
--filename=$zvolpath --sync=1
vdev_writes=$(( $(get_sync $DISK1) - $old_vdev_writes))
log_writes=$(( $(get_sync $datafile3) - $old_log_writes))
# When we're doing sync writes, we should see many more writes go to
# the log vs the first vdev. Experiments show anywhere from a 160-320x
# ratio of writes to the log vs the first vdev (due to some straggler
# writes to the first vdev).
#
# Check that we have a large ratio (100x) of sync writes going to the
# log device
ratio=$(($log_writes / $vdev_writes))
log_note "Got $log_writes log writes, $vdev_writes vdev writes."
if [ $ratio -lt 100 ] ; then
log_fail "Expected > 100x more log writes than vdev writes. "
# When doing sync writes, we should see at least one SLOG write per
# block (5MB / 4KB) == 1280.
log_note "Got $log_writes log writes."
if [ $log_writes -lt 1280 ] ; then
log_fail "Expected >= 1280 log writes. "
fi
# Create a data file
+21 -3
View File
@@ -433,6 +433,9 @@
/* iter_is_ubuf() is available */
/* #undef HAVE_ITER_IS_UBUF */
/* kasan_enabled() is GPL-only */
/* #undef HAVE_KASAN_ENABLED_GPL_ONLY */
/* kernel has kernel_fpu_* functions */
/* #undef HAVE_KERNEL_FPU */
@@ -826,6 +829,9 @@
/* make_request_fn() return type */
/* #undef MAKE_REQUEST_FN_RET */
/* The size of 'off_t', as computed by sizeof. */
/* #undef SIZEOF_OFF_T */
/* using complete_and_exit() instead */
/* #undef SPL_KTHREAD_COMPLETE_AND_EXIT */
@@ -856,7 +862,7 @@
/* #undef ZFS_DEVICE_MINOR */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.4.99-248-FreeBSD_g89f729dcc"
#define ZFS_META_ALIAS "zfs-2.4.99-292-FreeBSD_g962e68865"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@@ -865,7 +871,7 @@
/* #undef ZFS_META_DATA */
/* Define the maximum compatible kernel version. */
#define ZFS_META_KVER_MAX "6.17"
#define ZFS_META_KVER_MAX "6.18"
/* Define the minimum compatible kernel version. */
#define ZFS_META_KVER_MIN "4.18"
@@ -886,10 +892,22 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "248-FreeBSD_g89f729dcc"
#define ZFS_META_RELEASE "292-FreeBSD_g962e68865"
/* Define the project version. */
#define ZFS_META_VERSION "2.4.99"
/* count is located in percpu_ref.data */
/* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */
/* Number of bits in a file offset, on hosts where this is settable. */
/* #undef _FILE_OFFSET_BITS */
/* Define to 1 on platforms where this makes off_t a 64-bit type. */
/* #undef _LARGE_FILES */
/* Number of bits in time_t, on hosts where this is settable. */
/* #undef _TIME_BITS */
/* Define to 1 on platforms where this makes time_t a 64-bit type. */
/* #undef __MINGW_USE_VC2005_COMPAT */
+1 -1
View File
@@ -1 +1 @@
#define ZFS_META_GITREV "zfs-2.4.99-220-ge63d026b9"
#define ZFS_META_GITREV "zfs-2.4.99-292-g962e68865"