Notable upstream pull request merges:
 #15892 -multiple Fast Dedup: Introduce the FDT on-disk format and feature flag
 #15893 -multiple Fast Dedup: "flat" DDT entry format
 #15895 -multiple Fast Dedup: FDT-log feature
 #16239 6be8bf555 zpool: Provide GUID to zpool-reguid(8) with -g
 #16277 -multiple Fast Dedup: prune unique entries
 #16316 5807de90a Fix null ptr deref when renaming a zvol with snaps and snapdev=visible
 #16343 77a797a38 Enable L2 cache of all (MRU+MFU) metadata but MFU data only
 #16446 83f359245 FreeBSD: fix build without kernel option MAC
 #16449 963e6c9f3 Fix incorrect error report on vdev attach/replace
 #16505 b10992582 spa_prop_get: require caller to supply output nvlist

Obtained from:	OpenZFS
OpenZFS commit:	b109925820
Martin Matuska 2024-09-09 18:13:02 +02:00
commit e2df9bb441
132 changed files with 7382 additions and 1491 deletions

View File

@ -103,6 +103,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
CFLAGS+= -DHAVE_ISSETUGID
CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h

View File

@ -100,6 +100,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
CFLAGS+= -DHAVE_ISSETUGID -UHAVE_AVX -DRESCUE
CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h

View File

@ -63,10 +63,10 @@ KERNEL_C = \
zfs_fletcher_superscalar4.c \
zfs_namecheck.c \
zfs_prop.c \
zfs_valstr.c \
zpool_prop.c \
zprop_common.c
ARCH_C =
.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386"
ARCH_C += zfs_fletcher_intel.c \
@ -92,6 +92,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libshare
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include

View File

@ -1,5 +1,7 @@
ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
.PATH: ${ZFSTOP}/lib/libzpool
# ZFS_COMMON_SRCS
.PATH: ${ZFSTOP}/module/zfs
.PATH: ${ZFSTOP}/module/zcommon
@ -14,8 +16,6 @@ ZFSTOP= ${SRCTOP}/sys/contrib/openzfs
.PATH: ${ZFSTOP}/module/os/linux/zfs
.PATH: ${ZFSTOP}/lib/libzpool
.if exists(${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}/opensolaris_atomic.S)
.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}
ATOMIC_SRCS= opensolaris_atomic.S
@ -34,6 +34,7 @@ PACKAGE= zfs
LIB= zpool
USER_C = \
abd_os.c \
kernel.c \
taskq.c \
util.c
@ -51,7 +52,6 @@ KERNEL_C = \
zpool_prop.c \
zprop_common.c \
abd.c \
abd_os.c \
aggsum.c \
arc.c \
arc_os.c \
@ -67,6 +67,7 @@ KERNEL_C = \
dbuf.c \
dbuf_stats.c \
ddt.c \
ddt_log.c \
ddt_stats.c \
ddt_zap.c \
dmu.c \
@ -255,6 +256,7 @@ CFLAGS+= \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-I${ZFSTOP}/include/os/freebsd/zfs \
-I${SRCTOP}/cddl/compat/opensolaris/include \

View File

@ -22,6 +22,7 @@ MAN= \
zpool-create.8 \
zpool-destroy.8 \
zpool-detach.8 \
zpool-ddtprune.8 \
zpool-events.8 \
zpool-export.8 \
zpool-features.7 \
@ -66,6 +67,7 @@ CFLAGS+= \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${ZFSTOP}/cmd/zpool \

View File

@ -15,6 +15,7 @@ CFLAGS+= \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${ZFSTOP}/module/icp/include \

View File

@ -21,9 +21,11 @@ SYMLINKS= ${BINDIR}/zstream ${BINDIR}/zstreamdump
WARNS?= 2
CFLAGS+= \
-DIN_BASE \
-DZFS_DEBUG \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${ZFSTOP}/module/icp/include \

View File

@ -15,6 +15,7 @@ CFLAGS+= \
-I${ZFSTOP}/include \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/cddl/compat/opensolaris/include \
-I${ZFSTOP}/module/icp/include \
-include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \

View File

@ -18,6 +18,7 @@ CFLAGS+= \
-I${ZFSTOP}/lib/libspl/include \
-I${ZFSTOP}/lib/libspl/include/os/freebsd \
-I${ZFSTOP}/lib/libspl/include/os/freebsd/spl \
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \
-I${SRCTOP}/sys \
-include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
-DHAVE_ISSETUGID

View File

@ -17,6 +17,7 @@ CFLAGS+= -DIN_BASE
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
CFLAGS+= -I${SRCTOP}/cddl/usr.sbin

View File

@ -12,6 +12,7 @@ CFLAGS+= -DIN_BASE
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include

View File

@ -57,6 +57,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzfs
CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include
CFLAGS+= -I${SRCTOP}/sys
CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h

View File

@ -107,7 +107,7 @@ typedef struct zio_checksum_info {
#include "skein_zfs.c"
#ifdef HAS_ZSTD_ZFS
extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
extern int zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n);
#endif
@ -191,7 +191,7 @@ static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{NULL, zle_decompress, 64, "zle"},
{NULL, lz4_decompress, 0, "lz4"},
#ifdef HAS_ZSTD_ZFS
{NULL, zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
{NULL, zfs_zstd_decompress_buf, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};

View File

@ -238,6 +238,7 @@ contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c optional zfs compile-
contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zfs_namecheck.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zfs_prop.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zfs_valstr.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zpool_prop.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zcommon/zprop_common.c optional zfs compile-with "${ZFS_C}"
@ -270,6 +271,7 @@ contrib/openzfs/module/zfs/dbuf.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/dataset_kstats.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/ddt.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/ddt_log.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/ddt_stats.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/dmu.c optional zfs compile-with "${ZFS_C}"

View File

@ -6,5 +6,5 @@ Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
Linux-Maximum: 6.9
Linux-Maximum: 6.10
Linux-Minimum: 3.10

View File

@ -24,7 +24,7 @@ zfs_ids_to_path_LDADD = \
libzfs.la
zhack_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += zhack
CPPCHECKTARGETS += zhack
@ -39,7 +39,7 @@ zhack_LDADD = \
ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
ztest_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
ztest_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += ztest
CPPCHECKTARGETS += ztest

View File

@ -269,8 +269,7 @@ main(int argc, char **argv)
return (MOUNT_USAGE);
}
if (!zfsutil || sloppy ||
libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
}
@ -337,7 +336,7 @@ main(int argc, char **argv)
dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
if (!fake) {
if (zfsutil && !sloppy &&
if (!remount && !sloppy &&
!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) {
error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint);
if (error) {

View File

@ -1,5 +1,5 @@
raidz_test_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS)
raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
bin_PROGRAMS += raidz_test
CPPCHECKTARGETS += raidz_test

View File

@ -1,4 +1,4 @@
zdb_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
zdb_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
zdb_CFLAGS = $(AM_CFLAGS) $(LIBCRYPTO_CFLAGS)
sbin_PROGRAMS += zdb

View File

@ -33,7 +33,7 @@
* under sponsorship from the FreeBSD Foundation.
* Copyright (c) 2021 Allan Jude
* Copyright (c) 2021 Toomas Soome <tsoome@me.com>
* Copyright (c) 2023, Klara Inc.
* Copyright (c) 2023, 2024, Klara Inc.
* Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
*/
@ -1914,23 +1914,25 @@ dump_log_spacemaps(spa_t *spa)
}
static void
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
uint64_t index)
{
const ddt_phys_t *ddp = dde->dde_phys;
const ddt_key_t *ddk = &dde->dde_key;
const char *types[4] = { "ditto", "single", "double", "triple" };
const ddt_key_t *ddk = &ddlwe->ddlwe_key;
char blkbuf[BP_SPRINTF_LEN];
blkptr_t blk;
int p;
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
for (p = 0; p < DDT_NPHYS(ddt); p++) {
const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (ddt_phys_birth(ddp, v) == 0)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
(void) printf("index %llx refcnt %llu %s %s\n",
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
types[p], blkbuf);
(void) printf("index %llx refcnt %llu phys %d %s\n",
(u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
p, blkbuf);
}
}
@ -1956,11 +1958,37 @@ dump_dedup_ratio(const ddt_stat_t *dds)
dedup, compress, copies, dedup * compress / copies);
}
static void
dump_ddt_log(ddt_t *ddt)
{
for (int n = 0; n < 2; n++) {
ddt_log_t *ddl = &ddt->ddt_log[n];
uint64_t count = avl_numnodes(&ddl->ddl_tree);
if (count == 0)
continue;
printf(DMU_POOL_DDT_LOG ": %lu log entries\n",
zio_checksum_table[ddt->ddt_checksum].ci_name, n, count);
if (dump_opt['D'] < 4)
continue;
ddt_lightweight_entry_t ddlwe;
uint64_t index = 0;
for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
dump_ddt_entry(ddt, &ddlwe, index++);
}
}
}
static void
dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
char name[DDT_NAMELEN];
ddt_entry_t dde;
ddt_lightweight_entry_t ddlwe;
uint64_t walk = 0;
dmu_object_info_t doi;
uint64_t count, dspace, mspace;
@ -2001,8 +2029,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
(void) printf("%s contents:\n\n", name);
while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
dump_dde(ddt, &dde, walk);
while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0)
dump_ddt_entry(ddt, &ddlwe, walk);
ASSERT3U(error, ==, ENOENT);
@ -2017,7 +2045,7 @@ dump_all_ddts(spa_t *spa)
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
continue;
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
@ -2025,6 +2053,7 @@ dump_all_ddts(spa_t *spa)
dump_ddt(ddt, type, class);
}
}
dump_ddt_log(ddt);
}
ddt_get_dedup_stats(spa, &dds_total);
@ -2043,6 +2072,32 @@ dump_all_ddts(spa_t *spa)
}
dump_dedup_ratio(&dds_total);
/*
* Dump a histogram of unique class entry age
*/
if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) {
ddt_age_histo_t histogram;
(void) printf("DDT walk unique, building age histogram...\n");
ddt_prune_walk(spa, 0, &histogram);
/*
* print out histogram for unique entry class birth
*/
if (histogram.dah_entries > 0) {
(void) printf("%5s %9s %4s\n",
"age", "blocks", "amnt");
(void) printf("%5s %9s %4s\n",
"-----", "---------", "----");
for (int i = 0; i < HIST_BINS; i++) {
(void) printf("%5d %9d %4d%%\n", 1 << i,
(int)histogram.dah_age_histo[i],
(int)((histogram.dah_age_histo[i] * 100) /
histogram.dah_entries));
}
}
}
}
static void
@ -3287,9 +3342,45 @@ fuid_table_destroy(void)
}
}
/*
* Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on
* a live pool are normally cleaned up during ddt_sync(). We can't do that (and
* wouldn't want to anyway), but if we don't clean up, the stuff left on
* ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves.
*
* Note that this is not a particularly efficient way to do this, but
* ddt_remove() is the only public method that can do the work we need, and it
* requires the right locks and so on to do the job. This is only ever called
* during zdb shutdown so efficiency is not especially important.
*/
static void
zdb_ddt_cleanup(spa_t *spa)
{
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
continue;
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
ddt_enter(ddt);
ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
while (dde) {
next = AVL_NEXT(&ddt->ddt_tree, dde);
dde->dde_io = NULL;
ddt_remove(ddt, dde);
dde = next;
}
ddt_exit(ddt);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
}
static void
zdb_exit(int reason)
{
if (spa != NULL)
zdb_ddt_cleanup(spa);
if (os != NULL) {
close_objset(os, FTAG);
} else if (spa != NULL) {
@ -4592,7 +4683,6 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
l2arc_log_blk_phys_t this_lb;
uint64_t asize;
l2arc_log_blkptr_t lbps[2];
abd_t *abd;
zio_cksum_t cksum;
int failed = 0;
l2arc_dev_t dev;
@ -4646,20 +4736,25 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
case ZIO_COMPRESS_OFF:
break;
default:
abd = abd_alloc_for_io(asize, B_TRUE);
default: {
abd_t *abd = abd_alloc_linear(asize, B_TRUE);
abd_copy_from_buf_off(abd, &this_lb, 0, asize);
if (zio_decompress_data(L2BLK_GET_COMPRESS(
(&lbps[0])->lbp_prop), abd, &this_lb,
asize, sizeof (this_lb), NULL) != 0) {
abd_t dabd;
abd_get_from_buf_struct(&dabd, &this_lb,
sizeof (this_lb));
int err = zio_decompress_data(L2BLK_GET_COMPRESS(
(&lbps[0])->lbp_prop), abd, &dabd,
asize, sizeof (this_lb), NULL);
abd_free(&dabd);
abd_free(abd);
if (err != 0) {
(void) printf("L2ARC block decompression "
"failed\n");
abd_free(abd);
goto out;
}
abd_free(abd);
break;
}
}
if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
byteswap_uint64_array(&this_lb, sizeof (this_lb));
@ -5633,7 +5728,6 @@ static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type)
{
uint64_t refcnt = 0;
int i;
ASSERT(type < ZDB_OT_TOTAL);
@ -5641,8 +5735,167 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
if (zilog && zil_bp_tree_add(zilog, bp) != 0)
return;
/*
* This flag controls if we will issue a claim for the block while
* counting it, to ensure that all blocks are referenced in space maps.
* We don't issue claims if we're not doing leak tracking, because it's
* expensive if the user isn't interested. We also don't claim the
* second or later occurrences of cloned or dedup'd blocks, because we
* already claimed them the first time.
*/
boolean_t do_claim = !dump_opt['L'];
spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
blkptr_t tempbp;
if (BP_GET_DEDUP(bp)) {
/*
* Dedup'd blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* We use the existing dedup system to track what we've seen.
* The first time we see a block, we do a ddt_lookup() to see
* if it exists in the DDT. If we're doing leak tracking, we
* claim the block at this time.
*
* Each time we see a block, we reduce the refcount in the
* entry by one, and add to the size and count of dedup'd
* blocks to report at the end.
*/
ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
ddt_enter(ddt);
/*
* Find the block. This will create the entry in memory, but
* we'll know if that happened by its refcount.
*/
ddt_entry_t *dde = ddt_lookup(ddt, bp);
/*
* ddt_lookup() can return NULL if this block didn't exist
* in the DDT and creating it would take the DDT over its
* quota. Since we got the block from disk, it must exist in
* the DDT, so this can't happen. However, when unique entries
* are pruned, the dedup bit can be set with no corresponding
* entry in the DDT.
*/
if (dde == NULL) {
ddt_exit(ddt);
goto skipped;
}
/* Get the phys for this variant */
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
/*
* This entry may have multiple sets of DVAs. We must claim
* each set the first time we see them in a real block on disk,
* or count them on subsequent occurrences. We don't have a
* convenient way to track the first time we see each variant,
* so we repurpose dde_io as a set of "seen" flag bits. We can
* do this safely in zdb because it never writes, so it will
* never have a writing zio for this block in that pointer.
*/
boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
if (!seen)
dde->dde_io =
(void *)(((uintptr_t)dde->dde_io) | (1 << v));
/* Consume a reference for this block. */
if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
ddt_phys_decref(dde->dde_phys, v);
/*
* If this entry has a single flat phys, it may have been
* extended with additional DVAs at some time in its life.
* This block might be from before it was fully extended, and
* so have fewer DVAs.
*
* If this is the first time we've seen this block, and we
* claimed it as-is, then we would miss the claim on some
* number of DVAs, which would then be seen as leaked.
*
* In all cases, if we've had fewer DVAs, then the asize would
* be too small, and would lead to the pool apparently using
* more space than allocated.
*
* To handle this, we copy the canonical set of DVAs from the
* entry back to the block pointer before we claim it.
*/
if (v == DDT_PHYS_FLAT) {
ASSERT3U(BP_GET_BIRTH(bp), ==,
ddt_phys_birth(dde->dde_phys, v));
tempbp = *bp;
ddt_bp_fill(dde->dde_phys, v, &tempbp,
BP_GET_BIRTH(bp));
bp = &tempbp;
}
if (seen) {
/*
* The second or later time we see this block,
* it's a duplicate and we count it.
*/
zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
zcb->zcb_dedup_blocks++;
/* Already claimed, don't do it again. */
do_claim = B_FALSE;
}
ddt_exit(ddt);
} else if (zcb->zcb_brt_is_active &&
brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if it's there, we note it and its refcount, then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre == NULL) {
/* Not seen before; track it */
uint64_t refcnt =
brt_entry_get_refcount(zcb->zcb_spa, bp);
if (refcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = refcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
} else {
/*
* Second or later occurrence, count it and take a
* refcount.
*/
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
/* Already claimed, don't do it again. */
do_claim = B_FALSE;
}
}
skipped:
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
int t = (i & 1) ? type : ZDB_OT_TOTAL;
@ -5745,71 +5998,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
zcb->zcb_asize_total += BP_GET_ASIZE(bp);
if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if it's there, we note it and its refcount, then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre != NULL) {
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
return;
}
uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
if (crefcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = crefcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
}
if (dump_opt['L'])
if (!do_claim)
return;
if (BP_GET_DEDUP(bp)) {
ddt_t *ddt;
ddt_entry_t *dde;
ddt = ddt_select(zcb->zcb_spa, bp);
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_FALSE);
if (dde == NULL) {
refcnt = 0;
} else {
ddt_phys_t *ddp = ddt_phys_select(dde, bp);
ddt_phys_decref(ddp);
refcnt = ddp->ddp_refcnt;
if (ddt_phys_total_refcnt(dde) == 0)
ddt_remove(ddt, dde);
}
ddt_exit(ddt);
}
VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
ZIO_FLAG_CANFAIL)));
}
static void
@ -6120,49 +6314,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
return (counts);
}
static void
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
ddt_bookmark_t ddb = {0};
ddt_entry_t dde;
int error;
int p;
ASSERT(!dump_opt['L']);
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
ddt_phys_t *ddp = dde.dde_phys;
if (ddb.ddb_class == DDT_CLASS_UNIQUE)
return;
ASSERT(ddt_phys_total_refcnt(&dde) > 1);
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
VERIFY(ddt);
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
continue;
ddt_bp_create(ddb.ddb_checksum,
&dde.dde_key, ddp, &blk);
if (p == DDT_PHYS_DITTO) {
zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
} else {
zcb->zcb_dedup_asize +=
BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
zcb->zcb_dedup_blocks++;
}
}
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
}
ASSERT(error == ENOENT);
}
typedef struct checkpoint_sm_exclude_entry_arg {
vdev_t *cseea_vd;
uint64_t cseea_checkpoint_size;
@ -6546,10 +6697,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
increment_indirect_mapping_cb, zcb, NULL);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
zdb_ddt_leak_init(spa, zcb);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
static boolean_t
@ -6814,6 +6961,8 @@ dump_block_stats(spa_t *spa)
int e, c, err;
bp_embedded_type_t i;
ddt_prefetch_all(spa);
zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
@ -6938,7 +7087,6 @@ dump_block_stats(spa_t *spa)
(u_longlong_t)total_alloc,
(dump_opt['L']) ? "unreachable" : "leaked",
(longlong_t)(total_alloc - total_found));
leaks = B_TRUE;
}
if (tzb->zb_count == 0) {
@ -7272,29 +7420,27 @@ dump_simulated_ddt(spa_t *spa)
spa_config_exit(spa, SCL_CONFIG, FTAG);
while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
ddt_stat_t dds;
uint64_t refcnt = zdde->zdde_ref_blocks;
ASSERT(refcnt != 0);
dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
dds.dds_psize = zdde->zdde_ref_psize / refcnt;
dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1];
dds.dds_ref_blocks = zdde->zdde_ref_blocks;
dds.dds_ref_lsize = zdde->zdde_ref_lsize;
dds.dds_ref_psize = zdde->zdde_ref_psize;
dds.dds_ref_dsize = zdde->zdde_ref_dsize;
dds->dds_blocks += zdde->zdde_ref_blocks / refcnt;
dds->dds_lsize += zdde->zdde_ref_lsize / refcnt;
dds->dds_psize += zdde->zdde_ref_psize / refcnt;
dds->dds_dsize += zdde->zdde_ref_dsize / refcnt;
ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
&dds, 0);
dds->dds_ref_blocks += zdde->zdde_ref_blocks;
dds->dds_ref_lsize += zdde->zdde_ref_lsize;
dds->dds_ref_psize += zdde->zdde_ref_psize;
dds->dds_ref_dsize += zdde->zdde_ref_dsize;
umem_free(zdde, sizeof (*zdde));
}
avl_destroy(&t);
ddt_histogram_stat(&dds_total, &ddh_total);
ddt_histogram_total(&dds_total, &ddh_total);
(void) printf("Simulated DDT histogram:\n");
@ -8022,16 +8168,28 @@ dump_mos_leaks(spa_t *spa)
mos_leak_vdev(spa->spa_root_vdev);
for (uint64_t class = 0; class < DDT_CLASSES; class++) {
for (uint64_t type = 0; type < DDT_TYPES; type++) {
for (uint64_t cksum = 0;
cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
ddt_t *ddt = spa->spa_ddt[cksum];
if (!ddt)
continue;
for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
continue;
/* DDT store objects */
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
class++) {
mos_obj_refd(ddt->ddt_object[type][class]);
}
}
/* FDT container */
if (ddt->ddt_version == DDT_VERSION_FDT)
mos_obj_refd(ddt->ddt_dir_object);
/* FDT log objects */
if (ddt->ddt_flags & DDT_FLAG_LOG) {
mos_obj_refd(ddt->ddt_log[0].ddl_object);
mos_obj_refd(ddt->ddt_log[1].ddl_object);
}
}
if (spa->spa_brt != NULL) {
@ -8499,13 +8657,22 @@ try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
memset(lbuf, 0x00, lsize);
memset(lbuf2, 0xff, lsize);
abd_t labd, labd2;
abd_get_from_buf_struct(&labd, lbuf, lsize);
abd_get_from_buf_struct(&labd2, lbuf2, lsize);
boolean_t ret = B_FALSE;
if (zio_decompress_data(cfunc, pabd,
lbuf, psize, lsize, NULL) == 0 &&
&labd, psize, lsize, NULL) == 0 &&
zio_decompress_data(cfunc, pabd,
lbuf2, psize, lsize, NULL) == 0 &&
&labd2, psize, lsize, NULL) == 0 &&
memcmp(lbuf, lbuf2, lsize) == 0)
return (B_TRUE);
return (B_FALSE);
ret = B_TRUE;
abd_free(&labd2);
abd_free(&labd);
return (ret);
}
static uint64_t
@ -9624,6 +9791,9 @@ main(int argc, char **argv)
}
fini:
if (spa != NULL)
zdb_ddt_cleanup(spa);
if (os != NULL) {
close_objset(os, FTAG);
} else if (spa != NULL) {

View File

@ -844,7 +844,6 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
const char *failmode = NULL;
boolean_t checkremove = B_FALSE;
uint32_t pri = 0;
int32_t flags = 0;
/*
* If this is a checksum or I/O error, then toss it into the
@ -922,18 +921,28 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
}
} else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
uint64_t flags = 0;
int32_t flags32 = 0;
/*
* We ignore ereports for checksum errors generated by
* scrub/resilver I/O to avoid potentially further
* degrading the pool while it's being repaired.
*
* Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to
* be int32. To allow newer zed to work on older
* kernels, if we don't find the flags, we look for
* the older ones too.
*/
if (((nvlist_lookup_uint32(nvl,
FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
(pri == ZIO_PRIORITY_SCRUB ||
pri == ZIO_PRIORITY_REBUILD)) ||
((nvlist_lookup_int32(nvl,
((nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
(flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
(flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) ||
((nvlist_lookup_int32(nvl,
FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) &&
(flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
fmd_hdl_debug(hdl, "ignoring '%s' for "
"scrub/resilver I/O", class);
return;

View File

@ -75,6 +75,7 @@
#include "zpool_util.h"
#include "zfs_comutil.h"
#include "zfeature_common.h"
#include "zfs_valstr.h"
#include "statcommon.h"
@ -130,6 +131,8 @@ static int zpool_do_version(int, char **);
static int zpool_do_wait(int, char **);
static int zpool_do_ddt_prune(int, char **);
static int zpool_do_help(int argc, char **argv);
static zpool_compat_status_t zpool_do_load_compat(
@ -170,6 +173,7 @@ typedef enum {
HELP_CLEAR,
HELP_CREATE,
HELP_CHECKPOINT,
HELP_DDT_PRUNE,
HELP_DESTROY,
HELP_DETACH,
HELP_EXPORT,
@ -426,6 +430,8 @@ static zpool_command_t command_table[] = {
{ "sync", zpool_do_sync, HELP_SYNC },
{ NULL },
{ "wait", zpool_do_wait, HELP_WAIT },
{ NULL },
{ "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE },
};
#define NCOMMAND (ARRAY_SIZE(command_table))
@ -537,7 +543,7 @@ get_usage(zpool_help_t idx)
"\t [-o property=value] <pool> <newpool> "
"[<device> ...]\n"));
case HELP_REGUID:
return (gettext("\treguid <pool>\n"));
return (gettext("\treguid [-g guid] <pool>\n"));
case HELP_SYNC:
return (gettext("\tsync [pool] ...\n"));
case HELP_VERSION:
@ -545,6 +551,8 @@ get_usage(zpool_help_t idx)
case HELP_WAIT:
return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
"<pool> [interval]\n"));
case HELP_DDT_PRUNE:
return (gettext("\tddtprune -d|-p <amount> <pool>\n"));
default:
__builtin_unreachable();
}
@ -2025,7 +2033,7 @@ zpool_do_create(int argc, char **argv)
char *end;
u_longlong_t ver;
ver = strtoull(propval, &end, 10);
ver = strtoull(propval, &end, 0);
if (*end == '\0' &&
ver < SPA_VERSION_FEATURES) {
enable_pool_features = B_FALSE;
@ -8232,19 +8240,32 @@ zpool_do_clear(int argc, char **argv)
}
/*
* zpool reguid <pool>
* zpool reguid [-g <guid>] <pool>
*/
int
zpool_do_reguid(int argc, char **argv)
{
uint64_t guid;
uint64_t *guidp = NULL;
int c;
char *endptr;
char *poolname;
zpool_handle_t *zhp;
int ret = 0;
/* check options */
while ((c = getopt(argc, argv, "")) != -1) {
while ((c = getopt(argc, argv, "g:")) != -1) {
switch (c) {
case 'g':
errno = 0;
guid = strtoull(optarg, &endptr, 10);
if (errno != 0 || *endptr != '\0') {
(void) fprintf(stderr,
gettext("invalid GUID: %s\n"), optarg);
usage(B_FALSE);
}
guidp = &guid;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@ -8270,7 +8291,7 @@ zpool_do_reguid(int argc, char **argv)
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
ret = zpool_reguid(zhp);
ret = zpool_set_guid(zhp, guidp);
zpool_close(zhp);
return (ret);
@ -11916,6 +11937,7 @@ static void
zpool_do_events_nvprint(nvlist_t *nvl, int depth)
{
nvpair_t *nvp;
static char flagstr[256];
for (nvp = nvlist_next_nvpair(nvl, NULL);
nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
@ -11975,7 +11997,21 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth)
case DATA_TYPE_UINT32:
(void) nvpair_value_uint32(nvp, &i32);
printf(gettext("0x%x"), i32);
if (strcmp(name,
FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE) == 0 ||
strcmp(name,
FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE) == 0) {
zfs_valstr_zio_stage(i32, flagstr,
sizeof (flagstr));
printf(gettext("0x%x [%s]"), i32, flagstr);
} else if (strcmp(name,
FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY) == 0) {
zfs_valstr_zio_priority(i32, flagstr,
sizeof (flagstr));
printf(gettext("0x%x [%s]"), i32, flagstr);
} else {
printf(gettext("0x%x"), i32);
}
break;
case DATA_TYPE_INT64:
@ -11996,6 +12032,12 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth)
printf(gettext("\"%s\" (0x%llx)"),
zpool_state_to_name(i64, VDEV_AUX_NONE),
(u_longlong_t)i64);
} else if (strcmp(name,
FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS) == 0) {
zfs_valstr_zio_flag(i64, flagstr,
sizeof (flagstr));
printf(gettext("0x%llx [%s]"),
(u_longlong_t)i64, flagstr);
} else {
printf(gettext("0x%llx"), (u_longlong_t)i64);
}
@ -13329,6 +13371,88 @@ found:;
return (error);
}
/*
* zpool ddtprune -d|-p <amount> <pool>
*
* -d <days> Prune entries <days> old and older
* -p <percent> Prune <percent> amount of entries
*
* Prune single reference entries from DDT to satisfy the amount specified.
*/
int
zpool_do_ddt_prune(int argc, char **argv)
{
zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE;
uint64_t amount = 0;
zpool_handle_t *zhp;
char *endptr;
int c;
while ((c = getopt(argc, argv, "d:p:")) != -1) {
switch (c) {
case 'd':
if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
(void) fprintf(stderr, gettext("-d cannot be "
"combined with -p option\n"));
usage(B_FALSE);
}
errno = 0;
amount = strtoull(optarg, &endptr, 0);
if (errno != 0 || *endptr != '\0' || amount == 0) {
(void) fprintf(stderr,
gettext("invalid days value\n"));
usage(B_FALSE);
}
amount *= 86400; /* convert days to seconds */
unit = ZPOOL_DDT_PRUNE_AGE;
break;
case 'p':
if (unit == ZPOOL_DDT_PRUNE_AGE) {
(void) fprintf(stderr, gettext("-p cannot be "
"combined with -d option\n"));
usage(B_FALSE);
}
errno = 0;
amount = strtoull(optarg, &endptr, 0);
if (errno != 0 || *endptr != '\0' ||
amount == 0 || amount > 100) {
(void) fprintf(stderr,
gettext("invalid percentage value\n"));
usage(B_FALSE);
}
unit = ZPOOL_DDT_PRUNE_PERCENTAGE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (unit == ZPOOL_DDT_PRUNE_NONE) {
(void) fprintf(stderr,
gettext("missing amount option (-d|-p <value>)\n"));
usage(B_FALSE);
} else if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool argument\n"));
usage(B_FALSE);
} else if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
zhp = zpool_open(g_zfs, argv[0]);
if (zhp == NULL)
return (-1);
int error = zpool_ddt_prune(zhp, unit, amount);
zpool_close(zhp);
return (error);
}
static int
find_command_idx(const char *command, int *idx)
{

View File

@ -1,3 +1,5 @@
zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
sbin_PROGRAMS += zstream
CPPCHECKTARGETS += zstream

View File

@ -22,6 +22,8 @@
/*
* Copyright 2022 Axcient. All rights reserved.
* Use is subject to license terms.
*
* Copyright (c) 2024, Klara, Inc.
*/
#include <err.h>
@ -257,83 +259,73 @@ zstream_do_decompress(int argc, char *argv[])
ENTRY e = {.key = key};
p = hsearch(e, FIND);
if (p != NULL) {
zio_decompress_func_t *xfunc = NULL;
switch ((enum zio_compress)(intptr_t)p->data) {
case ZIO_COMPRESS_OFF:
xfunc = NULL;
break;
case ZIO_COMPRESS_LZJB:
xfunc = lzjb_decompress;
break;
case ZIO_COMPRESS_GZIP_1:
xfunc = gzip_decompress;
break;
case ZIO_COMPRESS_ZLE:
xfunc = zle_decompress;
break;
case ZIO_COMPRESS_LZ4:
xfunc = lz4_decompress_zfs;
break;
case ZIO_COMPRESS_ZSTD:
xfunc = zfs_zstd_decompress;
break;
default:
assert(B_FALSE);
}
/*
* Read and decompress the block
*/
char *lzbuf = safe_calloc(payload_size);
(void) sfread(lzbuf, payload_size, stdin);
if (xfunc == NULL) {
memcpy(buf, lzbuf, payload_size);
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
if (verbose)
fprintf(stderr, "Resetting "
"compression type to off "
"for ino %llu offset "
"%llu\n",
(u_longlong_t)
drrw->drr_object,
(u_longlong_t)
drrw->drr_offset);
} else if (0 != xfunc(lzbuf, buf,
payload_size, payload_size, 0)) {
/*
* The block must not be compressed,
* at least not with this compression
* type, possibly because it gets
* written multiple times in this
* stream.
*/
warnx("decompression failed for "
"ino %llu offset %llu",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
memcpy(buf, lzbuf, payload_size);
} else if (verbose) {
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
fprintf(stderr, "successfully "
"decompressed ino %llu "
"offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
} else {
drrw->drr_compressiontype =
ZIO_COMPRESS_OFF;
}
free(lzbuf);
} else {
if (p == NULL) {
/*
* Read the contents of the block unaltered
*/
(void) sfread(buf, payload_size, stdin);
break;
}
/*
* Read and decompress the block
*/
enum zio_compress c =
(enum zio_compress)(intptr_t)p->data;
if (c == ZIO_COMPRESS_OFF) {
(void) sfread(buf, payload_size, stdin);
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
if (verbose)
fprintf(stderr,
"Resetting compression type to "
"off for ino %llu offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
break;
}
uint64_t lsize = drrw->drr_logical_size;
ASSERT3U(payload_size, <=, lsize);
char *lzbuf = safe_calloc(payload_size);
(void) sfread(lzbuf, payload_size, stdin);
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, lzbuf, payload_size);
abd_get_from_buf_struct(&dabd, buf, lsize);
int err = zio_decompress_data(c, &sabd, &dabd,
payload_size, lsize, NULL);
abd_free(&dabd);
abd_free(&sabd);
if (err == 0) {
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
payload_size = lsize;
if (verbose) {
fprintf(stderr,
"successfully decompressed "
"ino %llu offset %llu\n",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
}
} else {
/*
* The block must not be compressed, at least
* not with this compression type, possibly
* because it gets written multiple times in
* this stream.
*/
warnx("decompression failed for "
"ino %llu offset %llu",
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
memcpy(buf, lzbuf, payload_size);
}
free(lzbuf);
break;
}

View File

@ -22,10 +22,9 @@
/*
* Copyright 2022 Axcient. All rights reserved.
* Use is subject to license terms.
*/
/*
*
* Copyright (c) 2022 by Delphix. All rights reserved.
* Copyright (c) 2024, Klara, Inc.
*/
#include <err.h>
@ -72,7 +71,7 @@ zstream_do_recompress(int argc, char *argv[])
dmu_replay_record_t *drr = &thedrr;
zio_cksum_t stream_cksum;
int c;
int level = -1;
int level = 0;
while ((c = getopt(argc, argv, "l:")) != -1) {
switch (c) {
@ -97,34 +96,22 @@ zstream_do_recompress(int argc, char *argv[])
if (argc != 1)
zstream_usage();
int type = 0;
zio_compress_info_t *cinfo = NULL;
if (0 == strcmp(argv[0], "off")) {
type = ZIO_COMPRESS_OFF;
cinfo = &zio_compress_table[type];
} else if (0 == strcmp(argv[0], "inherit") ||
0 == strcmp(argv[0], "empty") ||
0 == strcmp(argv[0], "on")) {
// Fall through to invalid compression type case
} else {
for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
if (0 == strcmp(zio_compress_table[i].ci_name,
argv[0])) {
cinfo = &zio_compress_table[i];
type = i;
break;
}
}
}
if (cinfo == NULL) {
fprintf(stderr, "Invalid compression type %s.\n",
argv[0]);
exit(2);
}
if (cinfo->ci_compress == NULL) {
type = 0;
cinfo = &zio_compress_table[0];
enum zio_compress ctype;
if (strcmp(argv[0], "off") == 0) {
ctype = ZIO_COMPRESS_OFF;
} else {
for (ctype = 0; ctype < ZIO_COMPRESS_FUNCTIONS; ctype++) {
if (strcmp(argv[0],
zio_compress_table[ctype].ci_name) == 0)
break;
}
if (ctype == ZIO_COMPRESS_FUNCTIONS ||
zio_compress_table[ctype].ci_compress == NULL) {
fprintf(stderr, "Invalid compression type %s.\n",
argv[0]);
exit(2);
}
}
if (isatty(STDIN_FILENO)) {
@ -135,6 +122,7 @@ zstream_do_recompress(int argc, char *argv[])
exit(1);
}
abd_init();
fletcher_4_init();
zio_init();
zstd_init();
@ -247,63 +235,78 @@ zstream_do_recompress(int argc, char *argv[])
(void) sfread(buf, payload_size, stdin);
break;
}
if (drrw->drr_compressiontype >=
ZIO_COMPRESS_FUNCTIONS) {
enum zio_compress dtype = drrw->drr_compressiontype;
if (dtype >= ZIO_COMPRESS_FUNCTIONS) {
fprintf(stderr, "Invalid compression type in "
"stream: %d\n", drrw->drr_compressiontype);
"stream: %d\n", dtype);
exit(3);
}
zio_compress_info_t *dinfo =
&zio_compress_table[drrw->drr_compressiontype];
if (zio_compress_table[dtype].ci_decompress == NULL)
dtype = ZIO_COMPRESS_OFF;
/* Set up buffers to minimize memcpys */
char *cbuf, *dbuf;
if (cinfo->ci_compress == NULL)
if (ctype == ZIO_COMPRESS_OFF)
dbuf = buf;
else
dbuf = safe_calloc(bufsz);
if (dinfo->ci_decompress == NULL)
if (dtype == ZIO_COMPRESS_OFF)
cbuf = dbuf;
else
cbuf = safe_calloc(payload_size);
/* Read and decompress the payload */
(void) sfread(cbuf, payload_size, stdin);
if (dinfo->ci_decompress != NULL) {
if (0 != dinfo->ci_decompress(cbuf, dbuf,
payload_size, MIN(bufsz,
drrw->drr_logical_size), dinfo->ci_level)) {
if (dtype != ZIO_COMPRESS_OFF) {
abd_t cabd, dabd;
abd_get_from_buf_struct(&cabd,
cbuf, payload_size);
abd_get_from_buf_struct(&dabd, dbuf,
MIN(bufsz, drrw->drr_logical_size));
if (zio_decompress_data(dtype, &cabd, &dabd,
payload_size, abd_get_size(&dabd),
NULL) != 0) {
warnx("decompression type %d failed "
"for ino %llu offset %llu",
type,
dtype,
(u_longlong_t)drrw->drr_object,
(u_longlong_t)drrw->drr_offset);
exit(4);
}
payload_size = drrw->drr_logical_size;
abd_free(&dabd);
abd_free(&cabd);
free(cbuf);
}
/* Recompress the payload */
if (cinfo->ci_compress != NULL) {
payload_size = P2ROUNDUP(cinfo->ci_compress(
dbuf, buf, drrw->drr_logical_size,
MIN(payload_size, bufsz), (level == -1 ?
cinfo->ci_level : level)),
SPA_MINBLOCKSIZE);
if (payload_size != drrw->drr_logical_size) {
drrw->drr_compressiontype = type;
drrw->drr_compressed_size =
payload_size;
} else {
if (ctype != ZIO_COMPRESS_OFF) {
abd_t dabd, abd;
abd_get_from_buf_struct(&dabd,
dbuf, drrw->drr_logical_size);
abd_t *pabd =
abd_get_from_buf_struct(&abd, buf, bufsz);
size_t csize = zio_compress_data(ctype, &dabd,
&pabd, drrw->drr_logical_size, level);
size_t rounded =
P2ROUNDUP(csize, SPA_MINBLOCKSIZE);
if (rounded >= drrw->drr_logical_size) {
memcpy(buf, dbuf, payload_size);
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
} else {
abd_zero_off(pabd, csize,
rounded - csize);
drrw->drr_compressiontype = ctype;
drrw->drr_compressed_size =
payload_size = rounded;
}
abd_free(&abd);
abd_free(&dabd);
free(dbuf);
} else {
drrw->drr_compressiontype = type;
drrw->drr_compressiontype = 0;
drrw->drr_compressed_size = 0;
}
break;
@ -371,6 +374,7 @@ zstream_do_recompress(int argc, char *argv[])
fletcher_4_fini();
zio_fini();
zstd_fini();
abd_fini();
return (0);
}

View File

@ -276,6 +276,8 @@ extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;
extern boolean_t ddt_prune_artificial_age;
extern boolean_t ddt_dump_prune_histogram;
static ztest_shared_opts_t *ztest_shared_opts;
@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
ztest_func_t ztest_pool_prefetch_ddt;
ztest_func_t ztest_ddt_prune;
static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@ -6747,7 +6751,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
load = spa_load_guid(spa);
(void) pthread_rwlock_wrlock(&ztest_name_lock);
error = spa_change_guid(spa);
error = spa_change_guid(spa, NULL);
zs->zs_guid = spa_guid(spa);
(void) pthread_rwlock_unlock(&ztest_name_lock);
@ -7289,6 +7293,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id)
mutex_exit(&ztest_vdev_lock);
}
void
ztest_ddt_prune(ztest_ds_t *zd, uint64_t id)
{
(void) zd, (void) id;
spa_t *spa = ztest_spa;
uint64_t pct = ztest_random(15) + 1;
(void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct);
}
/*
* Verify pool integrity by running zdb.
*/
@ -7470,6 +7485,13 @@ ztest_resume_thread(void *arg)
{
spa_t *spa = arg;
/*
* Synthesize aged DDT entries for ddt prune testing
*/
ddt_prune_artificial_age = B_TRUE;
if (ztest_opts.zo_verbose >= 3)
ddt_dump_prune_histogram = B_TRUE;
while (!ztest_exiting) {
if (spa_suspended(spa))
ztest_resume(spa);
@ -8588,6 +8610,12 @@ ztest_init(ztest_shared_t *zs)
if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
continue;
/*
* split 50/50 between legacy and fast dedup
*/
if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0)
continue;
VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
spa_feature_table[i].fi_uname));
fnvlist_add_uint64(props, buf, 0);

View File

@ -10,7 +10,8 @@ AM_CPPFLAGS = \
-I$(top_srcdir)/include \
-I$(top_srcdir)/module/icp/include \
-I$(top_srcdir)/lib/libspl/include \
-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@
-I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ \
-I$(top_srcdir)/lib/libzpool/include
AM_LIBTOOLFLAGS = --silent
@ -70,4 +71,7 @@ KERNEL_CFLAGS = $(FRAME_LARGER_THAN)
LIBRARY_CFLAGS = -no-suppress
# Forcibly enable asserts/debugging for libzpool &al.
FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG
# Since ZFS_DEBUG can change shared data structures, all libzpool users must
# be compiled with the same flags.
# See https://github.com/openzfs/zfs/issues/16476
LIBZPOOL_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG

View File

@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8
usr/share/man/man8/zpool-create.8
usr/share/man/man8/zpool-destroy.8
usr/share/man/man8/zpool-detach.8
usr/share/man/man8/zpool-ddtprune.8
usr/share/man/man8/zpool-events.8
usr/share/man/man8/zpool-export.8
usr/share/man/man8/zpool-get.8

View File

@ -14,6 +14,7 @@ COMMON_H = \
zfs_fletcher.h \
zfs_namecheck.h \
zfs_prop.h \
zfs_valstr.h \
\
sys/abd.h \
sys/abd_impl.h \

View File

@ -300,10 +300,14 @@ _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
_LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
_LIBZFS_H int zpool_reguid(zpool_handle_t *);
_LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *);
_LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
uint64_t);
_LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
_LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
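
The new zpool_set_guid() supersedes zpool_reguid() when the caller wants a specific GUID; a NULL pointer keeps the old behavior of generating a random one (as zpool_do_reguid() above does when -g is not given). A minimal caller sketch, assuming an initialized libzfs handle g_zfs and a pool named "tank" (both placeholders):

	uint64_t guid = 0x123456789abcdef0ULL;
	zpool_handle_t *zhp = zpool_open(g_zfs, "tank");
	if (zhp != NULL) {
		/* &guid sets this exact GUID; NULL would pick a random one */
		(void) zpool_set_guid(zhp, &guid);
		zpool_close(zhp);
	}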

View File

@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t,
uint64_t);
#ifdef __cplusplus
}
#endif
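
lzc_ddt_prune() is the ioctl-level entry point behind the new zpool ddtprune command. A sketch of calling it directly, assuming libzfs_core_init() has already been called and "tank" is a placeholder pool name; note the age unit is seconds, matching the days-to-seconds conversion in zpool_do_ddt_prune() above:

	/* prune 25% of the single-reference DDT entries */
	int err = lzc_ddt_prune("tank", ZPOOL_DDT_PRUNE_PERCENTAGE, 25);

	/* or prune single-reference entries 30 days old and older */
	err = lzc_ddt_prune("tank", ZPOOL_DDT_PRUNE_AGE, 30 * 86400);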

View File

@ -77,6 +77,8 @@ noinst_HEADERS = \
%D%/spl/sys/zmod.h \
%D%/spl/sys/zone.h \
\
%D%/zfs/sys/abd_os.h \
%D%/zfs/sys/abd_impl_os.h \
%D%/zfs/sys/arc_os.h \
%D%/zfs/sys/freebsd_crypto.h \
%D%/zfs/sys/freebsd_event.h \

View File

@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) critical_enter()
#define abd_exit_critical(flags) critical_exit()
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */

View File

@ -0,0 +1,46 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
void *abd_chunks[1]; /* actually variable-length */
};
struct abd_linear {
void *abd_buf;
};
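/*
 * A sketch of what "actually variable-length" means above (assumed; the
 * real sizing lives in the FreeBSD ABD implementation): the scatter ABD
 * is allocated with room for n chunk pointers, along the lines of
 *
 *	kmem_zalloc(offsetof(abd_t, abd_u.abd_scatter.abd_chunks[n]), KM_SLEEP);
 *
 * so abd_chunks[1] is only a placeholder for the trailing array.
 */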
#ifdef __cplusplus
}
#endif
#endif /* _ABD_OS_H */

View File

@ -20,6 +20,8 @@ kernel_linux_HEADERS = \
kernel_sysdir = $(kerneldir)/sys
kernel_sys_HEADERS = \
%D%/zfs/sys/abd_os.h \
%D%/zfs/sys/abd_impl_os.h \
%D%/zfs/sys/policy.h \
%D%/zfs/sys/trace_acl.h \
%D%/zfs/sys/trace_arc.h \

View File

@ -20,6 +20,10 @@
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright (c) 2024, Klara Inc.
* Copyright (c) 2024, Syneto
*/
#ifndef _SPL_TASKQ_H
#define _SPL_TASKQ_H
@ -33,6 +37,9 @@
#include <sys/thread.h>
#include <sys/rwlock.h>
#include <sys/wait.h>
#include <sys/wmsum.h>
typedef struct kstat_s kstat_t;
#define TASKQ_NAMELEN 31
@ -74,6 +81,32 @@ typedef enum tq_lock_role {
typedef unsigned long taskqid_t;
typedef void (task_func_t)(void *);
typedef struct taskq_sums {
/* gauges (inc/dec counters, current value) */
wmsum_t tqs_threads_active; /* threads running a task */
wmsum_t tqs_threads_idle; /* threads waiting for work */
wmsum_t tqs_threads_total; /* total threads */
wmsum_t tqs_tasks_pending; /* tasks waiting to execute */
wmsum_t tqs_tasks_priority; /* hi-pri tasks waiting */
wmsum_t tqs_tasks_total; /* total waiting tasks */
wmsum_t tqs_tasks_delayed; /* tasks deferred to future */
wmsum_t tqs_entries_free; /* task entries on free list */
/* counters (inc only, since taskq creation) */
wmsum_t tqs_threads_created; /* threads created */
wmsum_t tqs_threads_destroyed; /* threads destroyed */
wmsum_t tqs_tasks_dispatched; /* tasks dispatched */
wmsum_t tqs_tasks_dispatched_delayed; /* tasks delayed to future */
wmsum_t tqs_tasks_executed_normal; /* normal pri tasks executed */
wmsum_t tqs_tasks_executed_priority; /* high pri tasks executed */
wmsum_t tqs_tasks_executed; /* total tasks executed */
wmsum_t tqs_tasks_delayed_requeued; /* delayed tasks requeued */
wmsum_t tqs_tasks_cancelled; /* tasks cancelled before run */
wmsum_t tqs_thread_wakeups; /* total thread wakeups */
wmsum_t tqs_thread_wakeups_nowork; /* thread woken but no tasks */
wmsum_t tqs_thread_sleeps; /* total thread sleeps */
} taskq_sums_t;
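/*
 * A sketch of how these counters are meant to be driven (assumed from
 * the wmsum API; the real call sites are in the taskq implementation):
 * producers update locklessly with wmsum_add(), and the kstat read side
 * folds per-CPU state with wmsum_value(). For example:
 *
 *	wmsum_add(&tq->tq_sums.tqs_tasks_dispatched, 1);
 *	wmsum_add(&tq->tq_sums.tqs_tasks_pending, 1);	-- gauge up
 *	wmsum_add(&tq->tq_sums.tqs_tasks_pending, -1);	-- gauge down
 *	uint64_t n = wmsum_value(&tq->tq_sums.tqs_threads_active);
 */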
typedef struct taskq {
spinlock_t tq_lock; /* protects taskq_t */
char *tq_name; /* taskq name */
@ -105,6 +138,8 @@ typedef struct taskq {
struct hlist_node tq_hp_cb_node;
boolean_t tq_hp_support;
unsigned long lastspawnstop; /* when to purge dynamic */
taskq_sums_t tq_sums;
kstat_t *tq_ksp;
} taskq_t;
typedef struct taskq_ent {
@ -123,6 +158,13 @@ typedef struct taskq_ent {
#define TQENT_FLAG_PREALLOC 0x1
#define TQENT_FLAG_CANCEL 0x2
/* bits 2-3 are which list tqent is on */
#define TQENT_LIST_NONE 0x0
#define TQENT_LIST_PENDING 0x4
#define TQENT_LIST_PRIORITY 0x8
#define TQENT_LIST_DELAY 0xc
#define TQENT_LIST_MASK 0xc
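/*
 * A sketch of the field-style use these masks imply (illustrative,
 * assumed): the list bits form a 2-bit field, so they are cleared and
 * reassigned rather than OR'd over a stale value:
 *
 *	t->tqent_flags &= ~TQENT_LIST_MASK;
 *	t->tqent_flags |= TQENT_LIST_PRIORITY;
 *	if ((t->tqent_flags & TQENT_LIST_MASK) == TQENT_LIST_PENDING)
 *		...
 */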
typedef struct taskq_thread {
struct list_head tqt_thread_list;
struct list_head tqt_active_list;

View File

@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) local_irq_save(flags)
#define abd_exit_critical(flags) local_irq_restore(flags)
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */

View File

@ -0,0 +1,62 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
uint_t abd_nents;
struct scatterlist *abd_sgl;
};
struct abd_linear {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
};
typedef struct abd abd_t;
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#ifdef __cplusplus
}
#endif
#endif /* _ABD_OS_H */

View File

@ -30,6 +30,7 @@
#include <sys/debug.h>
#include <sys/zfs_refcount.h>
#include <sys/uio.h>
#include <sys/abd_os.h>
#ifdef __cplusplus
extern "C" {
@ -44,8 +45,7 @@ typedef enum abd_flags {
ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */
ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */
ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */
ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */
ABD_FLAG_ALLOCD = 1 << 9, /* we allocated the abd_t */
ABD_FLAG_ALLOCD = 1 << 8, /* we allocated the abd_t */
} abd_flags_t;
typedef struct abd {
@ -58,19 +58,8 @@ typedef struct abd {
#endif
kmutex_t abd_mtx;
union {
struct abd_scatter {
uint_t abd_offset;
#if defined(__FreeBSD__) && defined(_KERNEL)
void *abd_chunks[1]; /* actually variable-length */
#else
uint_t abd_nents;
struct scatterlist *abd_sgl;
#endif
} abd_scatter;
struct abd_linear {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
} abd_linear;
struct abd_scatter abd_scatter;
struct abd_linear abd_linear;
struct abd_gang {
list_t abd_gang_chain;
} abd_gang;
@ -79,9 +68,6 @@ typedef struct abd {
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif
extern int zfs_abd_scatter_enabled;
@ -107,6 +93,7 @@ abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t);
abd_t *abd_get_zeros(size_t);
abd_t *abd_get_from_buf(void *, size_t);
abd_t *abd_get_from_buf_struct(abd_t *, void *, size_t);
void abd_cache_reap_now(void);
/*
@ -128,10 +115,6 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
@ -225,16 +208,6 @@ abd_get_size(abd_t *abd)
void abd_init(void);
void abd_fini(void);
/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#endif
#ifdef __cplusplus
}
#endif

View File

@ -28,6 +28,7 @@
#define _ABD_IMPL_H
#include <sys/abd.h>
#include <sys/abd_impl_os.h>
#include <sys/wmsum.h>
#ifdef __cplusplus
@ -111,19 +112,6 @@ void abd_iter_page(struct abd_iter *);
#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
#define ABD_GANG(abd) (abd->abd_u.abd_gang)
#if defined(_KERNEL)
#if defined(__FreeBSD__)
#define abd_enter_critical(flags) critical_enter()
#define abd_exit_critical(flags) critical_exit()
#else
#define abd_enter_critical(flags) local_irq_save(flags)
#define abd_exit_critical(flags) local_irq_restore(flags)
#endif
#else /* !_KERNEL */
#define abd_enter_critical(flags) ((void)0)
#define abd_exit_critical(flags) ((void)0)
#endif
#ifdef __cplusplus
}
#endif

View File

@ -39,6 +39,13 @@ extern "C" {
struct abd;
/*
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
*/
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
#define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */
#define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG)
/*
* DDT on-disk storage object types. Each one corresponds to a specific
* implementation; see ddt_ops_t. The value itself is not stored on disk.
@ -120,30 +127,80 @@ typedef struct {
* characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count.
*
* Note that an entry has an array of four ddt_phys_t, one for each number of
* DVAs (copies= property) and another for additional "ditto" copies. Most
* users of ddt_phys_t will handle indexing into or counting the phys they
* want.
* The "traditional" entry has an array of four, one for each number of DVAs
* (copies= property) and another for additional "ditto" copies. Users of the
* traditional struct will specify the variant (index) of the one they want.
*
* The newer "flat" entry has only a single form that is specified using the
* DDT_PHYS_FLAT variant.
*
* Since the value size varies, use one of the size macros when interfacing
* with the ddt zap.
*/
typedef struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddt_phys_t;
#define DDT_PHYS_MAX (4)
/*
* Named indexes into the ddt_phys_t array in each entry.
* Note - this can be used in a flexible array and allocated for
* a specific size (ddp_trad or ddp_flat). So be careful not to
* copy using "=" assignment but instead use ddt_phys_copy().
*/
typedef union {
/*
* Traditional physical payload value for DDT zap (256 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddp_trad[DDT_PHYS_MAX];
/*
* Flat physical payload value for DDT zap (72 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth; /* txg based from BP */
uint64_t ddp_class_start; /* in realtime seconds */
} ddp_flat;
} ddt_univ_phys_t;
/*
* This enum denotes which variant of a ddt_univ_phys_t to target. For
* a traditional DDT entry, it represents the indexes into the ddp_trad
* array. Any consumer of a ddt_univ_phys_t needs to know which variant
* is being targeted.
*
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
* we maintain the ability to free existing dedup-ditto blocks.
*/
enum ddt_phys_type {
typedef enum {
DDT_PHYS_DITTO = 0,
DDT_PHYS_SINGLE = 1,
DDT_PHYS_DOUBLE = 2,
DDT_PHYS_TRIPLE = 3,
DDT_PHYS_TYPES
};
DDT_PHYS_FLAT = 4,
DDT_PHYS_NONE = 5
} ddt_phys_variant_t;
#define DDT_PHYS_VARIANT(ddt, p) \
(ASSERT((p) < DDT_PHYS_NONE), \
((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))
#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat)
#define _DDT_PHYS_SWITCH(ddt, flat, trad) \
(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))
#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \
DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)
#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p)
#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0))
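
The macro family above lets callers stay layout-agnostic. A sketch of the assumed usage (ddt and dde are presumed in scope): for a flat table, DDT_NPHYS() is 1 and every index resolves to DDT_PHYS_FLAT, so one loop serves both formats.

uint64_t total = 0;
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
	total += ddt_phys_refcnt(dde->dde_phys, v);
}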
/*
* A "live" entry, holding changes to an entry made this txg, and other data to
@ -153,17 +210,27 @@ enum ddt_phys_type {
/* State flags for dde_flags */
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */
#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */
/*
* Additional data to support entry update or repair. This is fixed size
* because it's relatively rarely used.
*/
typedef struct {
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;
/* original phys contents before update, for error handling */
ddt_univ_phys_t dde_orig_phys;
/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;
typedef struct {
/* key must be first for ddt_key_compare */
ddt_key_t dde_key; /* ddt_tree key */
ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */
/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
/* copy of data after a repair read, to be rewritten */
struct abd *dde_repair_abd;
ddt_key_t dde_key; /* ddt_tree key */
avl_node_t dde_node; /* ddt_tree_node */
/* storage type and class the entry was loaded from */
ddt_type_t dde_type;
@ -173,9 +240,35 @@ typedef struct {
kcondvar_t dde_cv; /* signaled when load completes */
uint64_t dde_waiters; /* count of waiters on dde_cv */
avl_node_t dde_node; /* ddt_tree node */
ddt_entry_io_t *dde_io; /* IO support, when required */
ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */
} ddt_entry_t;
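
Because dde_phys[] is a flexible array, an entry allocation must append the layout-dependent payload size; a sketch under that assumption, using the kernel SPL allocator:

ddt_entry_t *dde = kmem_zalloc(sizeof (ddt_entry_t) +
    DDT_PHYS_SIZE(ddt), KM_SLEEP);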
/*
* A lightweight entry is for short-lived or transient uses, like iterating or
* inspecting, when you don't care where it came from.
*/
typedef struct {
ddt_key_t ddlwe_key;
ddt_type_t ddlwe_type;
ddt_class_t ddlwe_class;
ddt_univ_phys_t ddlwe_phys;
} ddt_lightweight_entry_t;
/*
* In-core DDT log. A separate struct to make it easier to switch between the
* appending and flushing logs.
*/
typedef struct {
avl_tree_t ddl_tree; /* logged entries */
uint32_t ddl_flags; /* flags for this log */
uint64_t ddl_object; /* log object id */
uint64_t ddl_length; /* on-disk log size */
uint64_t ddl_first_txg; /* txg log became active */
ddt_key_t ddl_checkpoint; /* last checkpoint */
} ddt_log_t;
/*
* In-core DDT object. This covers all entries and stats for the whole pool
* for a given checksum type.
@ -184,23 +277,49 @@ typedef struct {
kmutex_t ddt_lock; /* protects changes to all fields */
avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
avl_tree_t ddt_log_tree; /* logged entries */
avl_tree_t ddt_repair_tree; /* entries being repaired */
enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
ddt_log_t ddt_log[2]; /* active/flushing logs */
ddt_log_t *ddt_log_active; /* pointers into ddt_log */
ddt_log_t *ddt_log_flushing; /* swapped when flush starts */
hrtime_t ddt_flush_start; /* log flush start this txg */
uint32_t ddt_flush_pass; /* log flush pass this txg */
int32_t ddt_flush_count; /* entries flushed this txg */
int32_t ddt_flush_min; /* min rem entries to flush */
int32_t ddt_log_ingest_rate; /* rolling log ingest rate */
int32_t ddt_log_flush_rate; /* rolling log flush rate */
int32_t ddt_log_flush_time_rate; /* avg time spent flushing */
uint64_t ddt_flush_force_txg; /* flush hard before this txg */
kstat_t *ddt_ksp; /* kstats context */
enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
uint64_t ddt_dir_object; /* MOS dir holding ddt objects */
uint64_t ddt_version; /* DDT version */
uint64_t ddt_flags; /* FDT option flags */
/* per-type/per-class entry store objects */
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
/* object ids for whole-ddt and per-type/per-class stats */
/* object ids for stored, logged and per-type/per-class stats */
uint64_t ddt_stat_object;
ddt_object_t ddt_log_stats;
ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
/* type/class stats by power-2-sized referenced blocks */
ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
/* log stats power-2-sized referenced blocks */
ddt_histogram_t ddt_log_histogram;
} ddt_t;
/*
@ -215,20 +334,36 @@ typedef struct {
uint64_t ddb_cursor;
} ddt_bookmark_t;
extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
uint64_t txg);
extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp);
const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);
extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
const blkptr_t *bp);
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
ddt_phys_variant_t v);
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted);
extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe);
extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe);
extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh);
extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
@ -243,7 +378,7 @@ extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
extern void ddt_init(void);
extern void ddt_fini(void);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_prefetch_all(spa_t *spa);
@ -251,6 +386,8 @@ extern void ddt_prefetch_all(spa_t *spa);
extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
const blkptr_t *bp);
extern void ddt_alloc_entry_io(ddt_entry_t *dde);
extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
@ -260,10 +397,17 @@ extern void ddt_create(spa_t *spa);
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
extern void ddt_walk_init(spa_t *spa, uint64_t txg);
extern boolean_t ddt_walk_ready(spa_t *spa);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
uint64_t amount);
#ifdef __cplusplus
}
#endif

View File

@ -28,11 +28,132 @@
#define _SYS_DDT_IMPL_H
#include <sys/ddt.h>
#include <sys/bitops.h>
#ifdef __cplusplus
extern "C" {
#endif
/* DDT version numbers */
#define DDT_VERSION_LEGACY (0)
#define DDT_VERSION_FDT (1)
/* Dummy version to signal that configure is still necessary */
#define DDT_VERSION_UNCONFIGURED (UINT64_MAX)
/* Names of interesting objects in the DDT root dir */
#define DDT_DIR_VERSION "version"
#define DDT_DIR_FLAGS "flags"
/* Fill a lightweight entry from a live entry. */
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \
memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)
#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (ddle)->ddle_key; \
(ddlwe)->ddlwe_type = (ddle)->ddle_type; \
(ddlwe)->ddlwe_class = (ddle)->ddle_class; \
memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)
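
A sketch of the intended use: snapshot a live entry into a stack lightweight entry, copying only the phys bytes that are valid for this table's layout.

ddt_lightweight_entry_t ddlwe;

DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);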
/*
* An entry on the log tree. These are "frozen" records of what's in the
* on-disk log. They can't be used in place, but can be "loaded" back into
* the live tree.
*/
typedef struct {
ddt_key_t ddle_key; /* ddt_log_tree key */
avl_node_t ddle_node; /* ddt_log_tree node */
ddt_type_t ddle_type; /* storage type */
ddt_class_t ddle_class; /* storage class */
/* extra allocation for flat/trad phys */
ddt_univ_phys_t ddle_phys[];
} ddt_log_entry_t;
/* On-disk log record types. */
typedef enum {
DLR_INVALID = 0, /* end of block marker */
DLR_ENTRY = 1, /* an entry to add or replace in the log tree */
} ddt_log_record_type_t;
/* On-disk log record header. */
typedef struct {
/*
* dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
* access it.
*
* bits 0-7: record type (ddt_log_record_type_t)
* bits 8-15: length of record header+payload
* bits 16-47: reserved, all zero
* bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
* otherwise all zero
* bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
* otherwise all zero
*/
uint64_t dlr_info;
uint8_t dlr_payload[];
} ddt_log_record_t;
#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8)
#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v)
#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16)
#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v)
#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8)
#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v)
#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8)
#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v)
/* Payload for DLR_ENTRY. */
typedef struct {
ddt_key_t dlre_key;
ddt_univ_phys_t dlre_phys[];
} ddt_log_record_entry_t;
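
A sketch of filling a DLR_ENTRY header through the accessors (assumed usage; the length arithmetic is illustrative, taking reclen as header plus key plus layout-sized phys):

static void
example_fill_dlr(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
    ddt_log_record_t *dlr)
{
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, sizeof (ddt_log_record_t) +
	    sizeof (ddt_key_t) + DDT_PHYS_SIZE(ddt));
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
}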
/* Log flags (ddl_flags, dlh_flags) */
#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */
#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */
/* On-disk log header, stored in the bonus buffer. */
typedef struct {
/*
* dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
* access it.
*
* bits 0-7: log version
* bits 8-15: log flags
* bits 16-63: reserved, all zero
*/
uint64_t dlh_info;
uint64_t dlh_length; /* log size in bytes */
uint64_t dlh_first_txg; /* txg this log went active */
ddt_key_t dlh_checkpoint; /* last checkpoint */
} ddt_log_header_t;
#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8)
#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v)
#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8)
#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v)
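
Reading the packed header back is symmetric; a hedged fragment (the version value checked here is illustrative, not a defined constant):

if (DLH_GET_VERSION(dlh) != 1)
	return (SET_ERROR(ENOTSUP));
boolean_t flushing = (DLH_GET_FLAGS(dlh) & DDL_FLAG_FLUSHING) != 0;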
/* DDT log update state */
typedef struct {
dmu_tx_t *dlu_tx; /* tx the update is being applied to */
dnode_t *dlu_dn; /* log object dnode */
dmu_buf_t **dlu_dbp; /* array of block buffer pointers */
int dlu_ndbp; /* number of block buffer pointers */
uint16_t dlu_reclen; /* cached length of record */
uint64_t dlu_block; /* block for next entry */
uint64_t dlu_offset; /* offset for next entry */
} ddt_log_update_t;
/*
* Ops vector to access a specific DDT object type.
*/
@ -42,25 +163,53 @@ typedef struct {
boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
const ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_contains)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
const ddt_key_t *ddk, const void *phys, size_t psize,
dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t;
extern const ddt_ops_t ddt_zap_ops;
extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
/* Dedup log API */
extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
ddt_log_update_t *dlu);
extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
ddt_log_update_t *dlu);
extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
const ddt_key_t *ddk);
extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
dmu_tx_t *tx);
extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);
extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);
extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);
extern int ddt_log_load(ddt_t *ddt);
extern void ddt_log_alloc(ddt_t *ddt);
extern void ddt_log_free(ddt_t *ddt);
extern void ddt_log_init(void);
extern void ddt_log_fini(void);
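
The begin/entry/commit trio suggests the following per-txg cycle; this is a sketch of assumed usage, with entries a hypothetical staging array of lightweight entries:

ddt_log_update_t dlu;

ddt_log_begin(ddt, nentries, tx, &dlu);
for (size_t i = 0; i < nentries; i++)
	ddt_log_entry(ddt, &entries[i], &dlu);
ddt_log_commit(ddt, &dlu);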
/*
* These are only exposed so that zdb can access them. Try not to use them
@ -68,22 +217,59 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
* them up.
*/
/*
* We use a histogram to convert a percentage request into a
* cutoff value where entries older than the cutoff get pruned.
*
* The histogram bins represent hours in power-of-two increments.
* 16 bins cover up to four years.
*/
#define HIST_BINS 16
typedef struct ddt_age_histo {
uint64_t dah_entries;
uint64_t dah_age_histo[HIST_BINS];
} ddt_age_histo_t;
void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram);
#if defined(_KERNEL) || !defined(ZFS_DEBUG)
#define ddt_dump_age_histogram(histo, cutoff) ((void)0)
#else
static inline void
ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff)
{
if (histogram->dah_entries == 0)
return;
(void) printf("DDT prune unique class age, %llu hour cutoff\n",
(u_longlong_t)(gethrestime_sec() - cutoff)/3600);
(void) printf("%5s %9s %4s\n", "age", "blocks", "amnt");
(void) printf("%5s %9s %4s\n", "-----", "---------", "----");
for (int i = 0; i < HIST_BINS; i++) {
(void) printf("%5d %9llu %4d%%\n", 1<<i,
(u_longlong_t)histogram->dah_age_histo[i],
(int)((histogram->dah_age_histo[i] * 100) /
histogram->dah_entries));
}
}
#endif
/*
* Enough room to expand DMU_POOL_DDT format for all possible DDT
* checksum/class/type combinations.
*/
#define DDT_NAMELEN 32
extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
const ddt_univ_phys_t *ddp);
extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
char *name);
extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
uint64_t *walk, ddt_entry_t *dde);
uint64_t *walk, ddt_lightweight_entry_t *ddlwe);
extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
uint64_t *count);
extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,

View File

@ -375,7 +375,9 @@ typedef struct dmu_buf {
#define DMU_POOL_L2CACHE "l2cache"
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
#define DMU_POOL_DDT "DDT-%s-%s-%s"
#define DMU_POOL_DDT_LOG "DDT-log-%s-%u"
#define DMU_POOL_DDT_STATS "DDT-statistics"
#define DMU_POOL_DDT_DIR "DDT-%s"
#define DMU_POOL_CREATION_VERSION "creation_version"
#define DMU_POOL_SCAN "scan"
#define DMU_POOL_ERRORSCRUB "error_scrub"

View File

@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx);
ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,

View File

@ -1422,7 +1422,7 @@ typedef enum {
*/
typedef enum zfs_ioc {
/*
* Core features - 88/128 numbers reserved.
* Core features - 89/128 numbers reserved.
*/
#ifdef __FreeBSD__
ZFS_IOC_FIRST = 0,
@ -1519,6 +1519,7 @@ typedef enum zfs_ioc {
ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */
ZFS_IOC_POOL_SCRUB, /* 0x5a57 */
ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */
ZFS_IOC_DDT_PRUNE, /* 0x5a59 */
/*
* Per-platform (Optional) - 8/128 numbers reserved.
@ -1655,6 +1656,12 @@ typedef enum {
ZPOOL_PREFETCH_DDT
} zpool_prefetch_type_t;
typedef enum {
ZPOOL_DDT_PRUNE_NONE,
ZPOOL_DDT_PRUNE_AGE, /* in seconds */
ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */
} zpool_ddt_prune_unit_t;
/*
* Bookmark name values.
*/
@ -1710,6 +1717,11 @@ typedef enum {
#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
/*
* The following are names used when invoking ZFS_IOC_POOL_REGUID.
*/
#define ZPOOL_REGUID_GUID "guid"
/*
* The following are names used when invoking ZFS_IOC_POOL_TRIM.
*/
@ -1748,6 +1760,12 @@ typedef enum {
*/
#define ZPOOL_PREFETCH_TYPE "prefetch_type"
/*
* The following are names used when invoking ZFS_IOC_DDT_PRUNE.
*/
#define DDT_PRUNE_UNIT "ddt_prune_unit"
#define DDT_PRUNE_AMOUNT "ddt_prune_amount"
/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/

View File

@ -572,7 +572,7 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp))
#define BP_ZERO(bp) \
#define BP_ZERO_DVAS(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \
@ -580,6 +580,11 @@ typedef struct blkptr {
(bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \
}
#define BP_ZERO(bp) \
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
@ -1087,7 +1092,7 @@ extern void spa_strfree(char *);
extern uint64_t spa_generate_guid(spa_t *spa);
extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern int spa_change_guid(spa_t *spa);
extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);
extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,

View File

@ -412,6 +412,7 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
boolean_t spa_active_ddt_prune; /* ddt prune process active */
struct brt *spa_brt; /* in-core BRT */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */

View File

@ -167,6 +167,9 @@ typedef enum zio_suspend_reason {
* This was originally an enum type. However, those are 32-bit and there is no
* way to make a 64-bit enum type. Since we ran out of bits for flags, we were
* forced to upgrade it to a uint64_t.
*
* NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
* FLAG.
*/
typedef uint64_t zio_flag_t;
/*

View File

@ -22,7 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, 2024, Klara, Inc.
* Use is subject to license terms.
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
*/
@ -122,25 +122,15 @@ enum zio_zstd_levels {
struct zio_prop;
/* Common signature for all zio compress functions. */
typedef size_t zio_compress_func_t(void *src, void *dst,
typedef size_t zio_compress_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress functions. */
typedef int zio_decompress_func_t(void *src, void *dst,
typedef int zio_decompress_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, int);
/* Common signature for all zio decompress and get level functions. */
typedef int zio_decompresslevel_func_t(void *src, void *dst,
typedef int zio_decompresslevel_func_t(abd_t *src, abd_t *dst,
size_t s_len, size_t d_len, uint8_t *level);
/* Common signature for all zio get-compression-level functions. */
typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level);
/*
* Common signature for all zio decompress functions using an ABD as input.
* This is helpful if you have both compressed ARC and scatter ABDs enabled,
* but is not a requirement for all compression algorithms.
*/
typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
size_t s_len, size_t d_len, int);
/*
* Information about each compression function.
*/
@ -163,34 +153,66 @@ extern void lz4_fini(void);
/*
* Compression routines.
*/
extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern int zfs_lzjb_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern size_t zfs_gzip_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern int zfs_gzip_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern size_t zfs_zle_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern int zfs_zle_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern size_t zfs_lz4_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
/*
* Compress and decompress data if necessary.
*/
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst,
extern size_t zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst,
size_t s_len, uint8_t level);
extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len, size_t d_len, uint8_t *level);
extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
extern int zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *abd,
size_t s_len, size_t d_len, uint8_t *level);
extern int zio_compress_to_feature(enum zio_compress comp);
#define ZFS_COMPRESS_WRAP_DECL(name) \
size_t \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
size_t c_len = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (c_len); \
}
#define ZFS_DECOMPRESS_WRAP_DECL(name) \
int \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (err); \
}
#define ZFS_DECOMPRESS_LEVEL_WRAP_DECL(name) \
int \
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *n) \
{ \
void *s_buf = abd_borrow_buf_copy(src, s_len); \
void *d_buf = abd_borrow_buf(dst, d_len); \
int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \
abd_return_buf(src, s_buf, s_len); \
abd_return_buf_copy(dst, d_buf, d_len); \
return (err); \
}
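
With these wrappers, each algorithm keeps a plain-buffer core and gains its abd_t front end from a single macro invocation. For example, assuming a zfs_lzjb_compress_buf() core (as the name##_buf expansion implies), the declaration

ZFS_COMPRESS_WRAP_DECL(zfs_lzjb_compress)

defines a size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) that borrows linear buffers, calls zfs_lzjb_compress_buf(), and returns them.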
#ifdef __cplusplus
}
#endif

View File

@ -120,6 +120,9 @@ extern "C" {
/*
* zio pipeline stage definitions
*
* NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
* FLAG.
*/
enum zio_stage {
ZIO_STAGE_OPEN = 1 << 0, /* RWFCXT */

View File

@ -22,6 +22,10 @@
extern "C" {
#endif
/*
* NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
* VALUE.
*/
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */

View File

@ -90,14 +90,12 @@ typedef struct zfs_zstd_meta {
int zstd_init(void);
void zstd_fini(void);
size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len,
size_t d_len, int level);
size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len,
size_t zfs_zstd_compress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int level);
int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level);
int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
int zfs_zstd_decompress_level(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, uint8_t *level);
int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
int zfs_zstd_decompress(abd_t *src, abd_t *dst, size_t s_len,
size_t d_len, int n);
void zfs_zstd_cache_reap_now(void);

View File

@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION,
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURES
} spa_feature_t;

View File

@ -0,0 +1,84 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024, Klara Inc.
*/
#ifndef _ZFS_VALSTR_H
#define _ZFS_VALSTR_H extern __attribute__((visibility("default")))
#include <sys/fs/zfs.h>
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* These macros create function prototypes for pretty-printing or stringifying
* certain kinds of numeric types.
*
* _ZFS_VALSTR_DECLARE_BITFIELD(name) creates:
*
* size_t zfs_valstr_<name>_bits(uint64_t bits, char *out, size_t outlen);
* expands single char for each set bit, and space for each clear bit
*
* size_t zfs_valstr_<name>_pairs(uint64_t bits, char *out, size_t outlen);
* expands two-char mnemonic for each bit set in `bits`, separated by `|`
*
* size_t zfs_valstr_<name>(uint64_t bits, char *out, size_t outlen);
* expands full name of each bit set in `bits`, separated by spaces
*
* _ZFS_VALSTR_DECLARE_ENUM(name) creates:
*
* size_t zfs_valstr_<name>(int v, char *out, size_t outlen);
* expands full name of enum value
*
* Each _ZFS_VALSTR_DECLARE_xxx needs a corresponding _VALSTR_xxx_IMPL string
* table in zfs_valstr.c.
*/
#define _ZFS_VALSTR_DECLARE_BITFIELD(name) \
_ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _bits( \
uint64_t bits, char *out, size_t outlen); \
_ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _pairs( \
uint64_t bits, char *out, size_t outlen); \
_ZFS_VALSTR_H size_t zfs_valstr_ ## name( \
uint64_t bits, char *out, size_t outlen); \
#define _ZFS_VALSTR_DECLARE_ENUM(name) \
_ZFS_VALSTR_H size_t zfs_valstr_ ## name( \
int v, char *out, size_t outlen); \
_ZFS_VALSTR_DECLARE_BITFIELD(zio_flag)
_ZFS_VALSTR_DECLARE_BITFIELD(zio_stage)
_ZFS_VALSTR_DECLARE_ENUM(zio_priority)
#undef _ZFS_VALSTR_DECLARE_BITFIELD
#undef _ZFS_VALSTR_DECLARE_ENUM
#ifdef __cplusplus
}
#endif
#endif /* _ZFS_VALSTR_H */
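
A sketch of assumed usage from debugging code (zio is a hypothetical in-scope pointer; the output is the two-char mnemonics joined by '|'):

char buf[256];

zfs_valstr_zio_flag_pairs(zio->io_flags, buf, sizeof (buf));
(void) printf("zio flags: %s\n", buf);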

View File

@ -47,6 +47,7 @@ nodist_libzfs_la_SOURCES = \
module/zcommon/zfs_fletcher_superscalar4.c \
module/zcommon/zfs_namecheck.c \
module/zcommon/zfs_prop.c \
module/zcommon/zfs_valstr.c \
module/zcommon/zpool_prop.c \
module/zcommon/zprop_common.c

View File

@ -183,8 +183,8 @@
<elf-symbol name='fsleep' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_dataset_depth' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_system_hostid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_timestamp' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getextmntent' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getmntany' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getprop_uint64' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -454,6 +454,13 @@
<elf-symbol name='zfs_userns' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_userspace' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valid_proplist' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_flag' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_flag_bits' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_flag_pairs' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_priority' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_stage' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_stage_bits' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_valstr_zio_stage_pairs' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_version_kernel' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_version_nvlist' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_version_print' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -466,7 +473,9 @@
<elf-symbol name='zpool_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_clear_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_close' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_default_search_paths' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_disable_datasets' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -485,8 +494,8 @@
<elf-symbol name='zpool_export_force' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_feature_init' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_config' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_parent_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev_by_physpath' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_free_handles' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_get_all_vdev_props' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -529,7 +538,6 @@
<elf-symbol name='zpool_prefetch' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prepare_and_label_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prepare_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_align_right' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_column_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_default_numeric' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -556,6 +564,7 @@
<elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_vdev_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_skip_pool' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -616,7 +625,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2296' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2352' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -5928,6 +5937,7 @@
<enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
<enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
<enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
<enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
<enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
<enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
<enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@ -5962,6 +5972,13 @@
<enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
</enum-decl>
<typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
<enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
<enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
<enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
</enum-decl>
<typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
<enum-decl name='spa_feature' id='33ecb627'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='SPA_FEATURE_NONE' value='-1'/>
@ -6006,7 +6023,8 @@
<enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
<enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
<enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
<enumerator name='SPA_FEATURES' value='41'/>
<enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
<enumerator name='SPA_FEATURES' value='42'/>
</enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@ -6137,6 +6155,12 @@
<parameter type-id='857bb57e'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='02e25ab0'/>
<parameter type-id='9c313c2d'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zfs_resolve_shortname' mangled-name='zfs_resolve_shortname' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_resolve_shortname'>
<parameter type-id='80f4b756'/>
<parameter type-id='26a90f95'/>
@ -6638,6 +6662,11 @@
<parameter type-id='9c313c2d' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_set_guid' mangled-name='zpool_set_guid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_set_guid'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='713a56f5' name='guid'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_reguid' mangled-name='zpool_reguid' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_reguid'>
<parameter type-id='4c81de99' name='zhp'/>
<return type-id='95e97e5e'/>
@ -6791,6 +6820,12 @@
<parameter type-id='80f4b756' name='propval'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_ddt_prune' mangled-name='zpool_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_ddt_prune'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='02e25ab0' name='unit'/>
<parameter type-id='9c313c2d' name='amount'/>
<return type-id='95e97e5e'/>
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='lib/libzfs/libzfs_sendrecv.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='8901473c' size-in-bits='576' id='f5da478b'>
@ -7830,7 +7865,7 @@
</data-member>
</class-decl>
<typedef-decl name='vdev_cbdata_t' type-id='b8006be8' id='a9679c94'/>
<class-decl name='zprop_get_cbdata' size-in-bits='832' is-struct='yes' visibility='default' id='f3d3c319'>
<class-decl name='zprop_get_cbdata' size-in-bits='960' is-struct='yes' visibility='default' id='f3d3c319'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='cb_sources' type-id='95e97e5e' visibility='default'/>
</data-member>
@ -7849,6 +7884,9 @@
<data-member access='public' layout-offset-in-bits='448'>
<var-decl name='cb_first' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='480'>
<var-decl name='cb_json' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='512'>
<var-decl name='cb_proplist' type-id='3a9b2288' visibility='default'/>
</data-member>
@ -7858,6 +7896,15 @@
<data-member access='public' layout-offset-in-bits='640'>
<var-decl name='cb_vdevs' type-id='a9679c94' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='832'>
<var-decl name='cb_jsobj' type-id='5ce45b60' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='896'>
<var-decl name='cb_json_as_int' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='928'>
<var-decl name='cb_json_pool_key_guid' type-id='c19b74c3' visibility='default'/>
</data-member>
</class-decl>
<typedef-decl name='zprop_get_cbdata_t' type-id='f3d3c319' id='f3d87113'/>
<typedef-decl name='zprop_func' type-id='2e711a2a' id='1ec3747a'/>
@ -7961,6 +8008,11 @@
<qualified-type-def type-id='d33f11cb' restrict='yes' id='5c53ba29'/>
<pointer-type-def type-id='ffa52b96' size-in-bits='64' id='76c8174b'/>
<pointer-type-def type-id='f3d87113' size-in-bits='64' id='0d2a0670'/>
<function-decl name='nvlist_print_json' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='822cd80b'/>
<parameter type-id='5ce45b60'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_label_disk' mangled-name='zpool_label_disk' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_label_disk'>
<parameter type-id='b0382bb3'/>
<parameter type-id='4c81de99'/>
@ -8068,6 +8120,11 @@
<parameter type-id='d33f11cb'/>
<return type-id='48b5725f'/>
</function-decl>
<function-decl name='putc' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='95e97e5e'/>
<parameter type-id='822cd80b'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='puts' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<return type-id='95e97e5e'/>
@ -8086,6 +8143,11 @@
<parameter type-id='95e97e5e'/>
<return type-id='48b5725f'/>
</function-decl>
<function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='80f4b756'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='strnlen' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='b59d7dce'/>
@ -8285,12 +8347,12 @@
<function-decl name='zfs_version_print' mangled-name='zfs_version_print' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_print'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zfs_version_nvlist' mangled-name='zfs_version_nvlist' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_nvlist'>
<return type-id='5ce45b60'/>
</function-decl>
<function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='printf_color' mangled-name='printf_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='printf_color'>
<parameter type-id='80f4b756' name='color'/>
<parameter type-id='80f4b756' name='format'/>
@ -8795,11 +8857,6 @@
<parameter type-id='78c01427'/>
<return type-id='13956559'/>
</function-decl>
<function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='80f4b756'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_dirnamelen' mangled-name='zfs_dirnamelen' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_dirnamelen'>
<parameter type-id='80f4b756' name='path'/>
<return type-id='79a0948f'/>
@ -9131,8 +9188,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18368' id='b93e4d14'>
<subrange length='41' type-id='7359adad' id='cb834f44'/>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18816' id='b937914f'>
<subrange length='42' type-id='7359adad' id='cb7c937f'/>
</array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/>
@ -9209,7 +9266,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='b93e4d14' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='spa_feature_table' type-id='b937914f' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
@ -9781,6 +9838,50 @@
<return type-id='c19b74c3'/>
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfs_valstr.c' language='LANG_C99'>
<function-decl name='zfs_valstr_zio_flag' mangled-name='zfs_valstr_zio_flag' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_flag_bits' mangled-name='zfs_valstr_zio_flag_bits' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag_bits'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_flag_pairs' mangled-name='zfs_valstr_zio_flag_pairs' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_flag_pairs'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_stage' mangled-name='zfs_valstr_zio_stage' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_stage_bits' mangled-name='zfs_valstr_zio_stage_bits' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage_bits'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_stage_pairs' mangled-name='zfs_valstr_zio_stage_pairs' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_stage_pairs'>
<parameter type-id='9c313c2d' name='bits'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_valstr_zio_priority' mangled-name='zfs_valstr_zio_priority' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_valstr_zio_priority'>
<parameter type-id='95e97e5e' name='v'/>
<parameter type-id='26a90f95' name='out'/>
<parameter type-id='b59d7dce' name='outlen'/>
<return type-id='b59d7dce'/>
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zpool_prop.c' language='LANG_C99'>
<function-decl name='zpool_prop_string_to_index' mangled-name='zpool_prop_string_to_index' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_prop_string_to_index'>
<parameter type-id='5d0c23fb' name='prop'/>

View File

@ -3733,6 +3733,13 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
(void) zpool_standard_error(hdl, errno, errbuf);
}
break;
case ZFS_ERR_ASHIFT_MISMATCH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"The new device cannot have a higher alignment requirement "
"than the top-level vdev."));
(void) zfs_error(hdl, EZFS_BADTARGET, errbuf);
break;
default:
(void) zpool_standard_error(hdl, errno, errbuf);
}
@ -4303,22 +4310,55 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
/*
* Change the GUID for a pool.
*
* Similar to zpool_reguid(), but may take a GUID.
*
* If the guid argument is NULL, then no GUID is passed in the nvlist to the
* ioctl().
*/
int
zpool_reguid(zpool_handle_t *zhp)
zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid)
{
char errbuf[ERRBUFLEN];
libzfs_handle_t *hdl = zhp->zpool_hdl;
nvlist_t *nvl = NULL;
zfs_cmd_t zc = {"\0"};
int error = -1;
if (guid != NULL) {
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
return (no_memory(hdl));
if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) {
nvlist_free(nvl);
return (no_memory(hdl));
}
zcmd_write_src_nvlist(hdl, &zc, nvl);
}
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
return (0);
error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc);
if (error) {
return (zpool_standard_error(hdl, errno, errbuf));
}
if (guid != NULL) {
zcmd_free_nvlists(&zc);
nvlist_free(nvl);
}
return (0);
}
return (zpool_standard_error(hdl, errno, errbuf));
/*
* Change the GUID for a pool.
*/
int
zpool_reguid(zpool_handle_t *zhp)
{
return (zpool_set_guid(zhp, NULL));
}
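
A minimal usage sketch of the new API (editor's illustration: the helper
name and the fallback policy are hypothetical; only zpool_set_guid() and
zpool_reguid() come from this change, and libzfs.h provides the types):

#include <libzfs.h>

/*
 * Try to set a specific GUID, falling back to a kernel-generated one
 * if the request fails (e.g. the GUID is already in use).
 */
static int
set_guid_or_fallback(zpool_handle_t *zhp, uint64_t wanted)
{
	if (zpool_set_guid(zhp, &wanted) == 0)
		return (0);
	return (zpool_reguid(zhp));
}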
/*
@ -5609,3 +5649,31 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
return (ret);
}
/*
* Prune older entries from the DDT to reclaim space under the quota
*/
int
zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit,
uint64_t amount)
{
int error = lzc_ddt_prune(zhp->zpool_name, unit, amount);
if (error != 0) {
libzfs_handle_t *hdl = zhp->zpool_hdl;
char errbuf[ERRBUFLEN];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot prune dedup table on '%s'"), zhp->zpool_name);
if (error == EALREADY) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"a prune operation is already in progress"));
(void) zfs_error(hdl, EZFS_BUSY, errbuf);
} else {
(void) zpool_standard_error(hdl, errno, errbuf);
}
return (-1);
}
return (0);
}
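
A matching usage sketch (editor's illustration: the helper name and the
10% figure are invented; zpool_ddt_prune() and ZPOOL_DDT_PRUNE_PERCENTAGE
are from this change). This is roughly what "zpool ddtprune -p 10 <pool>"
requests through the CLI:

#include <libzfs.h>

/* Prune 10% of the unique entries from the pool's dedup table. */
static int
prune_ten_percent(zpool_handle_t *zhp)
{
	return (zpool_ddt_prune(zhp, ZPOOL_DDT_PRUNE_PERCENTAGE, 10));
}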


@ -162,6 +162,7 @@
<elf-symbol name='lzc_channel_program_nosync' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_clone' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy_bookmarks' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy_snaps' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -1444,6 +1445,7 @@
<enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
<enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
<enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
<enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
<enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
<enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
<enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@ -1484,6 +1486,13 @@
<enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
</enum-decl>
<typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
<enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
<enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
<enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
</enum-decl>
<typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
<enum-decl name='data_type_t' naming-typedef-id='8d0687d2' id='aeeae136'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='DATA_TYPE_DONTCARE' value='-1'/>
@ -3015,6 +3024,12 @@
<parameter type-id='857bb57e' name='outnvl'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='lzc_ddt_prune' mangled-name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='lzc_ddt_prune'>
<parameter type-id='80f4b756' name='pool'/>
<parameter type-id='02e25ab0' name='unit'/>
<parameter type-id='9c313c2d' name='amount'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-type size-in-bits='64' id='c70fa2e8'>
<parameter type-id='95e97e5e'/>
<parameter type-id='eaa32e2f'/>


@ -1927,3 +1927,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl)
{
return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl));
}
/*
* Prune the specified amount from the pool's dedup table.
*/
int
lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount)
{
int error;
nvlist_t *result = NULL;
nvlist_t *args = fnvlist_alloc();
fnvlist_add_int32(args, DDT_PRUNE_UNIT, unit);
fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, amount);
error = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, args, &result);
fnvlist_free(args);
fnvlist_free(result);
return (error);
}


@ -1,7 +1,9 @@
include $(srcdir)/%D%/include/Makefile.am
libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS)
libzpool_la_CFLAGS += $(ZLIB_CFLAGS)
libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS)
libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS)
libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs
libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD
@ -9,6 +11,7 @@ lib_LTLIBRARIES += libzpool.la
CPPCHECKTARGETS += libzpool.la
dist_libzpool_la_SOURCES = \
%D%/abd_os.c \
%D%/kernel.c \
%D%/taskq.c \
%D%/util.c
@ -39,7 +42,6 @@ nodist_libzpool_la_SOURCES = \
module/lua/lvm.c \
module/lua/lzio.c \
\
module/os/linux/zfs/abd_os.c \
module/os/linux/zfs/arc_os.c \
module/os/linux/zfs/trace.c \
module/os/linux/zfs/vdev_file.c \
@ -62,6 +64,7 @@ nodist_libzpool_la_SOURCES = \
module/zcommon/zfs_fletcher_superscalar4.c \
module/zcommon/zfs_namecheck.c \
module/zcommon/zfs_prop.c \
module/zcommon/zfs_valstr.c \
module/zcommon/zpool_prop.c \
module/zcommon/zprop_common.c \
\
@ -79,6 +82,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/dbuf.c \
module/zfs/dbuf_stats.c \
module/zfs/ddt.c \
module/zfs/ddt_log.c \
module/zfs/ddt_stats.c \
module/zfs/ddt_zap.c \
module/zfs/dmu.c \


@ -0,0 +1,365 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
/*
* We're simulating scatter/gather with 4K allocations, since that's more like
* what a typical kernel does.
*/
#define ABD_PAGESIZE (4096)
#define ABD_PAGESHIFT (12)
#define ABD_PAGEMASK (ABD_PAGESIZE-1)
/*
* See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is
* mostly useful to get a mix of linear and scatter ABDs for testing.
*/
#define ABD_SCATTER_MIN_SIZE (512 * 3)
abd_t *abd_zero_scatter = NULL;
static uint_t
abd_iovcnt_for_bytes(size_t size)
{
/*
* Each iovec points to a 4K page. There's no real reason to do this
* in userspace, but our whole point here is to make it feel a bit
* more like a real paged memory model.
*/
return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE);
}
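
Worked values for the rounding above (editor's illustration, with
ABD_PAGESIZE == 4096 as defined earlier):

/*
 * abd_iovcnt_for_bytes(1)     == 1   one partial page
 * abd_iovcnt_for_bytes(4096)  == 1   exactly one page
 * abd_iovcnt_for_bytes(4097)  == 2   spills into a second page
 * abd_iovcnt_for_bytes(12288) == 3   three full pages
 */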
abd_t *
abd_alloc_struct_impl(size_t size)
{
/*
* Zero-sized means it will be used for a linear or gang abd, so just
* allocate the abd itself and return.
*/
if (size == 0)
return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL));
/*
* Allocating for a scatter abd, so compute how many ABD_PAGESIZE
* iovecs we will need to hold this size. Append that allocation to the
* end. Note that struct abd_scatter already includes abd_iov[1], so we
* allocate one fewer iovec than we need.
*
* Note we're not allocating the pages proper, just the iovec pointers.
* That's down in abd_alloc_chunks. We _could_ do it here in a single
* allocation, but it's fiddly and harder to read for no real gain.
*/
uint_t n = abd_iovcnt_for_bytes(size);
abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec),
UMEM_NOFAIL);
ABD_SCATTER(abd).abd_offset = 0;
ABD_SCATTER(abd).abd_iovcnt = n;
return (abd);
}
void
abd_free_struct_impl(abd_t *abd)
{
/* For scatter, compute the extra amount we need to free */
uint_t iovcnt =
abd_is_linear(abd) || abd_is_gang(abd) ?
0 : (ABD_SCATTER(abd).abd_iovcnt - 1);
umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec));
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
/*
* We've already allocated the iovec array; ensure that the wanted size
* actually matches, otherwise the caller has made a mistake somewhere.
*/
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
ASSERT3U(n, ==, abd_iovcnt_for_bytes(size));
/*
* Allocate an ABD_PAGESIZE region for each iovec.
*/
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
for (int i = 0; i < n; i++) {
iov[i].iov_base =
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
iov[i].iov_len = ABD_PAGESIZE;
}
}
void
abd_free_chunks(abd_t *abd)
{
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
struct iovec *iov = ABD_SCATTER(abd).abd_iov;
for (int i = 0; i < n; i++)
umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE);
}
boolean_t
abd_size_alloc_linear(size_t size)
{
return (size < ABD_SCATTER_MIN_SIZE);
}
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size;
if (op == ABDSTAT_INCR) {
arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
} else {
arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
}
}
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
(void) abd;
(void) op;
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
}
void
abd_verify_scatter(abd_t *abd)
{
#ifdef ZFS_DEBUG
/*
* scatter abds shall have:
* - at least one iovec
* - all iov_base point somewhere
* - all iov_len are ABD_PAGESIZE
* - offset set within the abd pages somewhere
*/
uint_t n = ABD_SCATTER(abd).abd_iovcnt;
ASSERT3U(n, >, 0);
uint_t len = 0;
for (int i = 0; i < n; i++) {
ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL);
ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE);
len += ABD_PAGESIZE;
}
ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len);
#endif
}
void
abd_init(void)
{
/*
* Create the "zero" scatter abd. This is always the size of the
* largest possible block, but only actually has a single allocated
* page, which all iovecs in the abd point to.
*/
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
void *zero =
umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL);
memset(zero, 0, ABD_PAGESIZE);
uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE);
struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov;
for (int i = 0; i < n; i++) {
iov[i].iov_base = zero;
iov[i].iov_len = ABD_PAGESIZE;
}
}
void
abd_fini(void)
{
umem_free_aligned(
ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE);
abd_free_struct(abd_zero_scatter);
abd_zero_scatter = NULL;
}
void
abd_free_linear_page(abd_t *abd)
{
/*
* LINEAR_PAGE is specific to the Linux kernel; we never set this
* flag, so this will never be called.
*/
(void) abd;
PANIC("unreachable");
}
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
return (abd_alloc(size, is_metadata));
}
abd_t *
abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size)
{
/*
* Create a new scatter dabd by borrowing data pages from sabd to cover
* off+size.
*
* sabd is an existing scatter abd with a set of iovecs, each covering
* an ABD_PAGESIZE (4K) allocation. Its "zero" is at abd_offset.
*
* [........][........][........][........]
* ^- sabd_offset
*
* We want to produce a new abd, referencing those allocations at the
* given offset.
*
* [........][........][........][........]
* ^- dabd_offset = sabd_offset + off
* ^- dabd_offset + size
*
* In this example, dabd needs three iovecs. The first iovec is offset
* 0, so the final dabd_offset is masked back into the first iovec.
*
* [........][........][........]
* ^- dabd_offset
*/
size_t soff = ABD_SCATTER(sabd).abd_offset + off;
size_t doff = soff & ABD_PAGEMASK;
size_t iovcnt = abd_iovcnt_for_bytes(doff + size);
/*
* If the passed-in abd has enough allocated iovecs already, reuse it.
* Otherwise, make a new one. The caller will free the original if the
* one it gets back is not the same.
*
* Note that it's ok if we reuse an abd with more iovecs than we need.
* abd_size has the usable amount of data, and the abd does not own the
* pages referenced by the iovecs. At worst, they're holding dangling
* pointers that we'll never use anyway.
*/
if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt)
dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT);
/* Set offset into first page in view */
ABD_SCATTER(dabd).abd_offset = doff;
/* Copy the wanted iovecs from the source to the dest */
memcpy(&ABD_SCATTER(dabd).abd_iov[0],
&ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT],
iovcnt * sizeof (struct iovec));
return (dabd);
}
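
A worked instance of the offset arithmetic above (editor's illustration;
the numbers are arbitrary):

/*
 * Suppose sabd has abd_offset == 1000 and the caller asks for
 * off == 9000, size == 6000:
 *   soff   = 1000 + 9000              = 10000
 *   doff   = 10000 & 4095             = 1808  (offset in first page)
 *   iovcnt = ceil((1808 + 6000)/4096) = 2
 * The copy starts at iovec 10000 >> 12 == 2 of sabd, so dabd borrows
 * sabd's iovecs 2 and 3 and exposes bytes [1808, 7808) within them.
 */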
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
}
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
if (abd_iter_at_end(aiter))
return;
aiter->iter_pos += amount;
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
}
void
abd_iter_map(struct abd_iter *aiter)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
if (abd_iter_at_end(aiter))
return;
if (abd_is_linear(aiter->iter_abd)) {
aiter->iter_mapaddr =
ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
aiter->iter_mapsize =
aiter->iter_abd->abd_size - aiter->iter_pos;
return;
}
/*
* For scatter, we index into the appropriate iovec and return the
* smaller of the amount requested and the amount remaining in the page.
*/
size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset;
ASSERT3U(poff >> ABD_PAGESHIFT, <=,
ABD_SCATTER(aiter->iter_abd).abd_iovcnt);
struct iovec *iov = &ABD_SCATTER(aiter->iter_abd).
abd_iov[poff >> ABD_PAGESHIFT];
aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK),
aiter->iter_abd->abd_size - aiter->iter_pos);
ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE);
aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK);
}
void
abd_iter_unmap(struct abd_iter *aiter)
{
if (abd_iter_at_end(aiter))
return;
ASSERT3P(aiter->iter_mapaddr, !=, NULL);
ASSERT3U(aiter->iter_mapsize, >, 0);
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
void
abd_cache_reap_now(void)
{
}


@ -0,0 +1,4 @@
libzpooldir = $(includedir)/libzpool
libzpool_HEADERS = \
%D%/sys/abd_os.h \
%D%/sys/abd_impl_os.h


@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_OS_H
#define _ABD_IMPL_OS_H
#ifdef __cplusplus
extern "C" {
#endif
#define abd_enter_critical(flags) ((void)0)
#define abd_exit_critical(flags) ((void)0)
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_OS_H */


@ -0,0 +1,47 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_OS_H
#define _ABD_OS_H
#ifdef __cplusplus
extern "C" {
#endif
struct abd_scatter {
uint_t abd_offset;
uint_t abd_iovcnt;
struct iovec abd_iov[1]; /* actually variable-length */
};
struct abd_linear {
void *abd_buf;
};
#ifdef __cplusplus
}
#endif
#endif /* _ABD_OS_H */


@ -72,6 +72,7 @@ dist_man_MANS = \
%D%/man8/zpool-create.8 \
%D%/man8/zpool-destroy.8 \
%D%/man8/zpool-detach.8 \
%D%/man8/zpool-ddtprune.8 \
%D%/man8/zpool-events.8 \
%D%/man8/zpool-export.8 \
%D%/man8/zpool-get.8 \


@ -175,17 +175,6 @@ Increasing this value will
result in a slower thread creation rate which may be preferable for some
configurations.
.
.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint
The maximum number of tasks per pending list in each taskq shown in
.Pa /proc/spl/taskq{,-all} .
Write
.Sy 0
to turn off the limit.
The proc file will walk the lists with lock held,
reading it could cause a lock-up if the list grow too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint
Minimum idle threads exit interval for dynamic taskqs.
Smaller values allow idle threads to exit more often and potentially be


@ -77,6 +77,17 @@ the array is dynamically sized based on total system memory.
dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed.
.
.It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint
Controls the number of copies stored for DeDup Table
.Pq DDT
objects.
Reducing the number of copies to 1 from the previous default of 3
can reduce the write inflation caused by deduplication.
This assumes redundancy for this data is provided by the vdev layer.
If the DDT is damaged, space may be leaked
.Pq not freed
when the DDT cannot report the correct reference count.
.
.It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
Limit the amount we can prefetch with one call to this amount in bytes.
This helps to limit the amount of memory that can be used by prefetching.
@ -121,20 +132,26 @@ Controls whether buffers present on special vdevs are eligible for caching
into L2ARC.
If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
.
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
Controls whether only MFU metadata and data are cached from ARC into L2ARC.
This may be desired to avoid wasting space on L2ARC when reading/writing large
amounts of data that are not expected to be accessed more than once.
.Pp
The default is off,
The default is 0,
meaning both MRU and MFU data and metadata are cached.
When turning off this feature, some MRU buffers will still be present
in ARC and eventually cached on L2ARC.
When turning off this feature (setting it to 0), some MRU buffers will
still be present in ARC and eventually cached on L2ARC.
.No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
some prefetched buffers will be cached to L2ARC, and those might later
transition to MRU, in which case the
.Sy l2arc_mru_asize No arcstat will not be Sy 0 .
.Pp
Setting it to 1 means to L2 cache only MFU data and metadata.
.Pp
Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
only MFU data (i.e., MRU data are not cached).
This can be the right setting to cache as much metadata as possible
even under high data turnover.
.Pp
Regardless of
.Sy l2arc_noprefetch ,
some MFU buffers might be evicted from ARC,
@ -821,6 +838,7 @@ This is a limit on how many pages the ARC shrinker makes available for
eviction in response to one page allocation attempt.
Note that in practice, the kernel's shrinker can ask us to evict
up to about four times this for one allocation attempt.
To reduce OOM risk, this limit is applied for kswapd reclaims only.
.Pp
The default limit of
.Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages
@ -974,6 +992,88 @@ milliseconds until the operation completes.
.It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
Enable prefetching dedup-ed blocks which are going to be freed.
.
.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
Maximum number of dedup log flush passes (iterations) per transaction.
.Pp
At the start of each transaction, OpenZFS will estimate how many entries it
needs to flush out to keep up with the change rate, taking the amount and time
taken to flush on previous txgs into account (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
It will spread this amount into a number of passes.
At each pass, it will use the amount already flushed and the total time taken
by flushing and by other IO to recompute how much it should do for the remainder
of the txg.
.Pp
Reducing the max number of passes will make flushing more aggressive, flushing
out more entries on each pass.
This can be faster, but also more likely to compete with other IO.
Increasing the max number of passes will put fewer entries onto each pass,
keeping the overhead of dedup changes to a minimum but possibly causing a large
number of changes to be dumped on the last pass, which can blow out the txg
sync time beyond
.Sy zfs_txg_timeout .
.
.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
Minimum time to spend on dedup log flush each transaction.
.Pp
At least this long will be spent flushing dedup log entries each transaction,
up to
.Sy zfs_txg_timeout .
This occurs even if doing so would delay the transaction, that is, even if
other IO completes in less than this time.
.
.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
Flush at least this many entries each transaction.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction to
keep up with the ingest rate (see
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
This sets the minimum for that estimate.
Raising it can force OpenZFS to flush more aggressively, keeping the log small
and so reducing pool import times, but can make it less able to back off if
log flushing would compete with other IO too much.
.
.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
Number of transactions to use to compute the flow rate.
.Pp
OpenZFS will estimate how many entries it needs to flush each transaction by
monitoring the number of entries changed (ingest rate), number of entries
flushed (flush rate) and time spent flushing (flush time rate) and combining
these into an overall "flow rate".
It will use an exponential weighted moving average over some number of recent
transactions to compute these rates.
This sets the number of transactions to compute these averages over.
Setting it higher can help to smooth out the flow rate in the face of spiky
workloads, but will take longer for the flow rate to adjust to a sustained
change in the ingest rate.
.
.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint
Max transactions to accumulate before starting to flush dedup logs.
.Pp
OpenZFS maintains two dedup logs, one receiving new changes, one flushing.
If there is nothing to flush, it will accumulate changes for no more than this
many transactions before switching the logs and starting to flush entries out.
.
.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64
Max memory to use for dedup logs.
.Pp
OpenZFS will spend no more than this much memory on maintaining the in-memory
dedup log.
Flushing will begin when around half this amount is being spent on logs.
The default value of
.Sy 0
will cause it to be set by
.Sy zfs_dedup_log_mem_max_percent
instead.
.
.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint
Max memory to use for dedup logs, as a percentage of total memory.
.Pp
If
.Sy zfs_dedup_log_mem_max
is not set, it will be initialised as a percentage of the total memory in the
system.
.
.It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
Start to delay each transaction once there is this amount of dirty data,
expressed as a percentage of

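The dedup log tunables above describe the flow-rate estimator without
showing it. As a rough sketch of an exponentially weighted moving average
over zfs_dedup_log_flush_flow_rate_txgs transactions (editor's
illustration only; all names here are invented and this is not the
kernel's actual implementation):

#include <stdint.h>

typedef struct flow_rate {
	double fr_rate;    /* smoothed per-txg rate */
	uint32_t fr_ntxgs; /* e.g. zfs_dedup_log_flush_flow_rate_txgs */
} flow_rate_t;

/*
 * Feed one txg's observation (entries ingested, entries flushed, or
 * ms spent flushing); each sample nudges the smoothed rate by
 * 1/ntxgs of the difference, so larger ntxgs smooths more but
 * adapts more slowly.
 */
static void
flow_rate_update(flow_rate_t *fr, uint64_t observed)
{
	double alpha = 1.0 / (double)fr->fr_ntxgs;
	fr->fr_rate += alpha * ((double)observed - fr->fr_rate);
}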

@ -17,8 +17,9 @@
.\" Copyright (c) 2019, Klara Inc.
.\" Copyright (c) 2019, Allan Jude
.\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
.\" Copyright (c) 2023, Klara Inc.
.\"
.Dd June 23, 2022
.Dd February 14, 2024
.Dt ZPOOL-FEATURES 7
.Os
.
@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the
.Sy enabled
state when all datasets that use this feature are destroyed.
.
.feature com.klarasystems fast_dedup yes
This feature allows more advanced deduplication features to be enabled on new
dedup tables.
.Pp
This feature will be
.Sy active
when the first deduplicated block is written after a new dedup table is created
(i.e., after a new pool creation, or a new checksum is used on a dataset with
.Sy dedup
enabled).
It will be returned to the
.Sy enabled
state when all deduplicated blocks using it are freed.
.
.feature com.delphix extensible_dataset no
This feature allows more flexible use of internal ZFS data structures,
and exists for other features to depend on.


@ -0,0 +1,48 @@
.\"
.\" CDDL HEADER START
.\"
.\" The contents of this file are subject to the terms of the
.\" Common Development and Distribution License (the "License").
.\" You may not use this file except in compliance with the License.
.\"
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
.\" or http://www.opensolaris.org/os/licensing.
.\" See the License for the specific language governing permissions
.\" and limitations under the License.
.\"
.\" When distributing Covered Code, include this CDDL HEADER in each
.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
.\" If applicable, add the following below this CDDL HEADER, with the
.\" fields enclosed by brackets "[]" replaced with your own identifying
.\" information: Portions Copyright [yyyy] [name of copyright owner]
.\"
.\" CDDL HEADER END
.\"
.\"
.\" Copyright (c) 2024, Klara Inc.
.\"
.Dd June 17, 2024
.Dt ZPOOL-DDTPRUNE 8
.Os
.
.Sh NAME
.Nm zpool-ddtprune
.Nd prune the oldest entries from the single-reference dedup table(s)
.Sh SYNOPSIS
.Nm zpool
.Cm ddtprune
.Fl d Ar days | Fl p Ar percentage
.Ar pool
.Sh DESCRIPTION
This command prunes older unique entries from the dedup table.
As a complement to the dedup quota feature,
.Sy ddtprune
allows removal of older non-duplicate entries to make room for
newer duplicate entries.
.Pp
The amount to prune can be based on a target percentage of the unique entries
or based on the age (i.e., every unique entry older than N days).
.
.Sh SEE ALSO
.Xr zdb 8 ,
.Xr zpool-status 8


@ -25,8 +25,10 @@
.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\" Copyright (c) 2024, Klara Inc.
.\" Copyright (c) 2024, Mateusz Piotrowski
.\"
.Dd May 31, 2021
.Dd June 21, 2023
.Dt ZPOOL-REGUID 8
.Os
.
@ -36,6 +38,7 @@
.Sh SYNOPSIS
.Nm zpool
.Cm reguid
.Op Fl g Ar guid
.Ar pool
.
.Sh DESCRIPTION
@ -43,6 +46,15 @@ Generates a new unique identifier for the pool.
You must ensure that all devices in this pool are online and healthy before
performing this action.
.
.Bl -tag -width Ds
.It Fl g Ar guid
Set the pool GUID to the provided value.
The GUID can be any 64-bit value accepted by
.Xr strtoull 3
in base 10.
.Nm
will return an error if the provided GUID is already in use.
.El
.Sh SEE ALSO
.Xr zpool-export 8 ,
.Xr zpool-import 8


@ -592,6 +592,7 @@ don't wait.
.Xr zpool-checkpoint 8 ,
.Xr zpool-clear 8 ,
.Xr zpool-create 8 ,
.Xr zpool-ddtprune 8 ,
.Xr zpool-destroy 8 ,
.Xr zpool-detach 8 ,
.Xr zpool-events 8 ,


@ -16,8 +16,8 @@ src = @abs_srcdir@
obj = @abs_builddir@
else
zfs_include = $(srctree)/include/zfs
icp_include = $(srctree)/$(src)/icp/include
zstd_include = $(srctree)/$(src)/zstd/include
icp_include = $(src)/icp/include
zstd_include = $(src)/zstd/include
ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
endif
@ -240,6 +240,7 @@ ZCOMMON_OBJS := \
zfs_fletcher_superscalar4.o \
zfs_namecheck.o \
zfs_prop.o \
zfs_valstr.o \
zpool_prop.o \
zprop_common.o
@ -322,6 +323,7 @@ ZFS_OBJS := \
dbuf.o \
dbuf_stats.o \
ddt.o \
ddt_log.o \
ddt_stats.o \
ddt_zap.o \
dmu.o \


@ -233,6 +233,7 @@ SRCS+= cityhash.c \
zfs_fletcher_superscalar.c \
zfs_namecheck.c \
zfs_prop.c \
zfs_valstr.c \
zpool_prop.c \
zprop_common.c
@ -252,6 +253,7 @@ SRCS+= abd.c \
dbuf.c \
dbuf_stats.c \
ddt.c \
ddt_log.c \
ddt_stats.c \
ddt_zap.c \
dmu.c \
@ -426,6 +428,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast
CFLAGS.abd.c= -Wno-cast-qual
CFLAGS.ddt.c= -Wno-cast-qual
CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith
CFLAGS.ddt_zap.c= -Wno-cast-qual
CFLAGS.dmu.c= -Wno-cast-qual
CFLAGS.dmu_traverse.c= -Wno-cast-qual


@ -95,14 +95,12 @@ struct {
*/
static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1;
#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN,
&zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations.");
#endif
kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;
@ -250,7 +248,7 @@ abd_alloc_zero_scatter(void)
n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;


@ -124,7 +124,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
"ZFS livelist condense");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
"ZFS VDEV mirror");


@ -868,16 +868,16 @@ spl_init(void)
if ((rc = spl_tsd_init()))
goto out2;
if ((rc = spl_taskq_init()))
if ((rc = spl_proc_init()))
goto out3;
if ((rc = spl_kmem_cache_init()))
if ((rc = spl_kstat_init()))
goto out4;
if ((rc = spl_proc_init()))
if ((rc = spl_taskq_init()))
goto out5;
if ((rc = spl_kstat_init()))
if ((rc = spl_kmem_cache_init()))
goto out6;
if ((rc = spl_zlib_init()))
@ -891,13 +891,13 @@ spl_init(void)
out8:
spl_zlib_fini();
out7:
spl_kstat_fini();
out6:
spl_proc_fini();
out5:
spl_kmem_cache_fini();
out4:
out6:
spl_taskq_fini();
out5:
spl_kstat_fini();
out4:
spl_proc_fini();
out3:
spl_tsd_fini();
out2:
@ -913,10 +913,10 @@ spl_fini(void)
{
spl_zone_fini();
spl_zlib_fini();
spl_kstat_fini();
spl_proc_fini();
spl_kmem_cache_fini();
spl_taskq_fini();
spl_kstat_fini();
spl_proc_fini();
spl_tsd_fini();
spl_kvmem_fini();
spl_random_fini();


@ -31,7 +31,6 @@
#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/vmem.h>
#include <sys/taskq.h>
#include <sys/proc.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
@ -63,8 +62,6 @@ static struct ctl_table_header *spl_kstat = NULL;
static struct proc_dir_entry *proc_spl = NULL;
static struct proc_dir_entry *proc_spl_kmem = NULL;
static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
static struct proc_dir_entry *proc_spl_taskq_all = NULL;
static struct proc_dir_entry *proc_spl_taskq = NULL;
struct proc_dir_entry *proc_spl_kstat = NULL;
#ifdef DEBUG_KMEM
@ -177,195 +174,6 @@ proc_dohostid(CONST_CTL_TABLE *table, int write,
return (0);
}
static void
taskq_seq_show_headers(struct seq_file *f)
{
seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
"taskq", "act", "nthr", "spwn", "maxt", "pri",
"mina", "maxa", "cura", "flags");
}
/* indices into the lheads array below */
#define LHEAD_PEND 0
#define LHEAD_PRIO 1
#define LHEAD_DELAY 2
#define LHEAD_WAIT 3
#define LHEAD_ACTIVE 4
#define LHEAD_SIZE 5
static unsigned int spl_max_show_tasks = 512;
/* CSTYLED */
module_param(spl_max_show_tasks, uint, 0644);
MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
static int
taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
{
taskq_t *tq = p;
taskq_thread_t *tqt = NULL;
spl_wait_queue_entry_t *wq;
struct task_struct *tsk;
taskq_ent_t *tqe;
char name[100];
struct list_head *lheads[LHEAD_SIZE], *lh;
static char *list_names[LHEAD_SIZE] =
{"pend", "prio", "delay", "wait", "active" };
int i, j, have_lheads = 0;
unsigned long wflags, flags;
spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
/* get the various lists and check whether they're empty */
lheads[LHEAD_PEND] = &tq->tq_pend_list;
lheads[LHEAD_PRIO] = &tq->tq_prio_list;
lheads[LHEAD_DELAY] = &tq->tq_delay_list;
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
#else
lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
#endif
lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
for (i = 0; i < LHEAD_SIZE; ++i) {
if (list_empty(lheads[i]))
lheads[i] = NULL;
else
++have_lheads;
}
/* early return in non-"all" mode if lists are all empty */
if (!allflag && !have_lheads) {
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
spin_unlock_irqrestore(&tq->tq_lock, flags);
return (0);
}
/* unlock the waitq quickly */
if (!lheads[LHEAD_WAIT])
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
/* show the base taskq contents */
snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
seq_printf(f, "%-25s ", name);
seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
tq->tq_nalloc, tq->tq_flags);
/* show the active list */
if (lheads[LHEAD_ACTIVE]) {
j = 0;
list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
if (j == 0)
seq_printf(f, "\t%s:",
list_names[LHEAD_ACTIVE]);
else if (j == 2) {
seq_printf(f, "\n\t ");
j = 0;
}
seq_printf(f, " [%d]%pf(%ps)",
tqt->tqt_thread->pid,
tqt->tqt_task->tqent_func,
tqt->tqt_task->tqent_arg);
++j;
}
seq_printf(f, "\n");
}
for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
if (lheads[i]) {
j = 0;
list_for_each(lh, lheads[i]) {
if (spl_max_show_tasks != 0 &&
j >= spl_max_show_tasks) {
seq_printf(f, "\n\t(truncated)");
break;
}
/* show the wait waitq list */
if (i == LHEAD_WAIT) {
#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
wq = list_entry(lh,
spl_wait_queue_entry_t, entry);
#else
wq = list_entry(lh,
spl_wait_queue_entry_t, task_list);
#endif
if (j == 0)
seq_printf(f, "\t%s:",
list_names[i]);
else if (j % 8 == 0)
seq_printf(f, "\n\t ");
tsk = wq->private;
seq_printf(f, " %d", tsk->pid);
/* pend, prio and delay lists */
} else {
tqe = list_entry(lh, taskq_ent_t,
tqent_list);
if (j == 0)
seq_printf(f, "\t%s:",
list_names[i]);
else if (j % 2 == 0)
seq_printf(f, "\n\t ");
seq_printf(f, " %pf(%ps)",
tqe->tqent_func,
tqe->tqent_arg);
}
++j;
}
seq_printf(f, "\n");
}
if (lheads[LHEAD_WAIT])
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
spin_unlock_irqrestore(&tq->tq_lock, flags);
return (0);
}
static int
taskq_all_seq_show(struct seq_file *f, void *p)
{
return (taskq_seq_show_impl(f, p, B_TRUE));
}
static int
taskq_seq_show(struct seq_file *f, void *p)
{
return (taskq_seq_show_impl(f, p, B_FALSE));
}
static void *
taskq_seq_start(struct seq_file *f, loff_t *pos)
{
struct list_head *p;
loff_t n = *pos;
down_read(&tq_list_sem);
if (!n)
taskq_seq_show_headers(f);
p = tq_list.next;
while (n--) {
p = p->next;
if (p == &tq_list)
return (NULL);
}
return (list_entry(p, taskq_t, tq_taskqs));
}
static void *
taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
{
taskq_t *tq = p;
++*pos;
return ((tq->tq_taskqs.next == &tq_list) ?
NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
}
static void
slab_seq_show_headers(struct seq_file *f)
{
@ -501,66 +309,6 @@ static const kstat_proc_op_t proc_slab_operations = {
#endif
};
static void
taskq_seq_stop(struct seq_file *f, void *v)
{
up_read(&tq_list_sem);
}
static const struct seq_operations taskq_all_seq_ops = {
.show = taskq_all_seq_show,
.start = taskq_seq_start,
.next = taskq_seq_next,
.stop = taskq_seq_stop,
};
static const struct seq_operations taskq_seq_ops = {
.show = taskq_seq_show,
.start = taskq_seq_start,
.next = taskq_seq_next,
.stop = taskq_seq_stop,
};
static int
proc_taskq_all_open(struct inode *inode, struct file *filp)
{
return (seq_open(filp, &taskq_all_seq_ops));
}
static int
proc_taskq_open(struct inode *inode, struct file *filp)
{
return (seq_open(filp, &taskq_seq_ops));
}
static const kstat_proc_op_t proc_taskq_all_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
.proc_open = proc_taskq_all_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
#else
.open = proc_taskq_all_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
#endif
};
static const kstat_proc_op_t proc_taskq_operations = {
#ifdef HAVE_PROC_OPS_STRUCT
.proc_open = proc_taskq_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
#else
.open = proc_taskq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
#endif
};
static struct ctl_table spl_kmem_table[] = {
#ifdef DEBUG_KMEM
{
@ -677,8 +425,6 @@ static void spl_proc_cleanup(void)
remove_proc_entry("kstat", proc_spl);
remove_proc_entry("slab", proc_spl_kmem);
remove_proc_entry("kmem", proc_spl);
remove_proc_entry("taskq-all", proc_spl);
remove_proc_entry("taskq", proc_spl);
remove_proc_entry("spl", NULL);
#ifndef HAVE_REGISTER_SYSCTL_TABLE
@ -761,20 +507,6 @@ spl_proc_init(void)
goto out;
}
proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
&proc_taskq_all_operations, NULL);
if (proc_spl_taskq_all == NULL) {
rc = -EUNATCH;
goto out;
}
proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
&proc_taskq_operations, NULL);
if (proc_spl_taskq == NULL) {
rc = -EUNATCH;
goto out;
}
proc_spl_kmem = proc_mkdir("kmem", proc_spl);
if (proc_spl_kmem == NULL) {
rc = -EUNATCH;


@ -22,16 +22,98 @@
*
* Solaris Porting Layer (SPL) Task Queue Implementation.
*/
/*
* Copyright (c) 2024, Klara Inc.
* Copyright (c) 2024, Syneto
*/
#include <sys/timer.h>
#include <sys/taskq.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/trace_spl.h>
#include <sys/time.h>
#include <sys/atomic.h>
#include <sys/kstat.h>
#ifdef HAVE_CPU_HOTPLUG
#include <linux/cpuhotplug.h>
#endif
typedef struct taskq_kstats {
/* static values, for completeness */
kstat_named_t tqks_threads_max;
kstat_named_t tqks_entry_pool_min;
kstat_named_t tqks_entry_pool_max;
/* gauges (inc/dec counters, current value) */
kstat_named_t tqks_threads_active;
kstat_named_t tqks_threads_idle;
kstat_named_t tqks_threads_total;
kstat_named_t tqks_tasks_pending;
kstat_named_t tqks_tasks_priority;
kstat_named_t tqks_tasks_total;
kstat_named_t tqks_tasks_delayed;
kstat_named_t tqks_entries_free;
/* counters (inc only, since taskq creation) */
kstat_named_t tqks_threads_created;
kstat_named_t tqks_threads_destroyed;
kstat_named_t tqks_tasks_dispatched;
kstat_named_t tqks_tasks_dispatched_delayed;
kstat_named_t tqks_tasks_executed_normal;
kstat_named_t tqks_tasks_executed_priority;
kstat_named_t tqks_tasks_executed;
kstat_named_t tqks_tasks_delayed_requeued;
kstat_named_t tqks_tasks_cancelled;
kstat_named_t tqks_thread_wakeups;
kstat_named_t tqks_thread_wakeups_nowork;
kstat_named_t tqks_thread_sleeps;
} taskq_kstats_t;
static taskq_kstats_t taskq_kstats_template = {
{ "threads_max", KSTAT_DATA_UINT64 },
{ "entry_pool_min", KSTAT_DATA_UINT64 },
{ "entry_pool_max", KSTAT_DATA_UINT64 },
{ "threads_active", KSTAT_DATA_UINT64 },
{ "threads_idle", KSTAT_DATA_UINT64 },
{ "threads_total", KSTAT_DATA_UINT64 },
{ "tasks_pending", KSTAT_DATA_UINT64 },
{ "tasks_priority", KSTAT_DATA_UINT64 },
{ "tasks_total", KSTAT_DATA_UINT64 },
{ "tasks_delayed", KSTAT_DATA_UINT64 },
{ "entries_free", KSTAT_DATA_UINT64 },
{ "threads_created", KSTAT_DATA_UINT64 },
{ "threads_destroyed", KSTAT_DATA_UINT64 },
{ "tasks_dispatched", KSTAT_DATA_UINT64 },
{ "tasks_dispatched_delayed", KSTAT_DATA_UINT64 },
{ "tasks_executed_normal", KSTAT_DATA_UINT64 },
{ "tasks_executed_priority", KSTAT_DATA_UINT64 },
{ "tasks_executed", KSTAT_DATA_UINT64 },
{ "tasks_delayed_requeued", KSTAT_DATA_UINT64 },
{ "tasks_cancelled", KSTAT_DATA_UINT64 },
{ "thread_wakeups", KSTAT_DATA_UINT64 },
{ "thread_wakeups_nowork", KSTAT_DATA_UINT64 },
{ "thread_sleeps", KSTAT_DATA_UINT64 },
};
#define TQSTAT_INC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, 1)
#define TQSTAT_DEC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, -1)
#define _TQSTAT_MOD_LIST(mod, tq, t) do { \
switch (t->tqent_flags & TQENT_LIST_MASK) { \
case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\
case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \
case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \
case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \
} \
} while (0)
#define TQSTAT_INC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_INC, tq, t)
#define TQSTAT_DEC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t)
#define TQENT_SET_LIST(t, l) \
t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l;
static int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
@ -134,6 +216,7 @@ task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
ASSERT(!timer_pending(&t->tqent_timer));
list_del_init(&t->tqent_list);
TQSTAT_DEC(tq, entries_free);
return (t);
}
@ -204,12 +287,11 @@ task_done(taskq_t *tq, taskq_ent_t *t)
{
ASSERT(tq);
ASSERT(t);
ASSERT(list_empty(&t->tqent_list));
/* Wake tasks blocked in taskq_wait_id() */
wake_up_all(&t->tqent_waitq);
list_del_init(&t->tqent_list);
if (tq->tq_nalloc <= tq->tq_minalloc) {
t->tqent_id = TASKQID_INVALID;
t->tqent_func = NULL;
@ -217,6 +299,7 @@ task_done(taskq_t *tq, taskq_ent_t *t)
t->tqent_flags = 0;
list_add_tail(&t->tqent_list, &tq->tq_free_list);
TQSTAT_INC(tq, entries_free);
} else {
task_free(tq, t);
}
@ -263,6 +346,8 @@ task_expire_impl(taskq_ent_t *t)
spin_unlock_irqrestore(&tq->tq_lock, flags);
wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_delayed_requeued);
}
static void
@ -534,7 +619,11 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
t = taskq_find(tq, id);
if (t && t != ERR_PTR(-EBUSY)) {
list_del_init(&t->tqent_list);
TQSTAT_DEC_LIST(tq, t);
TQSTAT_DEC(tq, tasks_total);
t->tqent_flags |= TQENT_FLAG_CANCEL;
TQSTAT_INC(tq, tasks_cancelled);
/*
* When canceling the lowest outstanding task id we
@ -604,13 +693,19 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
spin_lock(&t->tqent_lock);
/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
if (flags & TQ_NOQUEUE)
if (flags & TQ_NOQUEUE) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add(&t->tqent_list, &tq->tq_prio_list);
/* Queue to the priority list instead of the pending list */
else if (flags & TQ_FRONT)
} else if (flags & TQ_FRONT) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add_tail(&t->tqent_list, &tq->tq_prio_list);
else
} else {
TQENT_SET_LIST(t, TQENT_LIST_PENDING);
list_add_tail(&t->tqent_list, &tq->tq_pend_list);
}
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = rc = tq->tq_next_id;
tq->tq_next_id++;
@ -629,6 +724,8 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_dispatched);
/* Spawn additional taskq threads if required. */
if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
@ -662,6 +759,9 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
/* Queue to the delay list for subsequent execution */
list_add_tail(&t->tqent_list, &tq->tq_delay_list);
TQENT_SET_LIST(t, TQENT_LIST_DELAY);
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = rc = tq->tq_next_id;
tq->tq_next_id++;
@ -676,6 +776,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
spin_unlock(&t->tqent_lock);
TQSTAT_INC(tq, tasks_dispatched_delayed);
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
@ -724,10 +826,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
t->tqent_flags |= TQENT_FLAG_PREALLOC;
/* Queue to the priority list instead of the pending list */
if (flags & TQ_FRONT)
if (flags & TQ_FRONT) {
TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
list_add_tail(&t->tqent_list, &tq->tq_prio_list);
else
} else {
TQENT_SET_LIST(t, TQENT_LIST_PENDING);
list_add_tail(&t->tqent_list, &tq->tq_pend_list);
}
TQSTAT_INC_LIST(tq, t);
TQSTAT_INC(tq, tasks_total);
t->tqent_id = tq->tq_next_id;
tq->tq_next_id++;
@ -742,6 +849,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
wake_up(&tq->tq_work_waitq);
TQSTAT_INC(tq, tasks_dispatched);
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
@ -908,6 +1017,8 @@ taskq_thread(void *args)
wake_up(&tq->tq_wait_waitq);
set_current_state(TASK_INTERRUPTIBLE);
TQSTAT_INC(tq, threads_total);
while (!kthread_should_stop()) {
if (list_empty(&tq->tq_pend_list) &&
@ -919,9 +1030,15 @@ taskq_thread(void *args)
add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
spin_unlock_irqrestore(&tq->tq_lock, flags);
TQSTAT_INC(tq, thread_sleeps);
TQSTAT_INC(tq, threads_idle);
schedule();
seq_tasks = 0;
TQSTAT_DEC(tq, threads_idle);
TQSTAT_INC(tq, thread_wakeups);
spin_lock_irqsave_nested(&tq->tq_lock, flags,
tq->tq_lock_class);
remove_wait_queue(&tq->tq_work_waitq, &wait);
@ -931,6 +1048,8 @@ taskq_thread(void *args)
if ((t = taskq_next_ent(tq)) != NULL) {
list_del_init(&t->tqent_list);
TQSTAT_DEC_LIST(tq, t);
TQSTAT_DEC(tq, tasks_total);
/*
* A TQENT_FLAG_PREALLOC task may be reused or freed
@ -955,6 +1074,7 @@ taskq_thread(void *args)
tq->tq_nactive++;
spin_unlock_irqrestore(&tq->tq_lock, flags);
TQSTAT_INC(tq, threads_active);
DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
/* Perform the requested task */
@ -962,8 +1082,17 @@ taskq_thread(void *args)
DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
TQSTAT_DEC(tq, threads_active);
if ((t->tqent_flags & TQENT_LIST_MASK) ==
TQENT_LIST_PENDING)
TQSTAT_INC(tq, tasks_executed_normal);
else
TQSTAT_INC(tq, tasks_executed_priority);
TQSTAT_INC(tq, tasks_executed);
spin_lock_irqsave_nested(&tq->tq_lock, flags,
tq->tq_lock_class);
tq->tq_nactive--;
list_del_init(&tqt->tqt_active_list);
tqt->tqt_task = NULL;
@ -989,7 +1118,8 @@ taskq_thread(void *args)
tqt->tqt_id = TASKQID_INVALID;
tqt->tqt_flags = 0;
wake_up_all(&tq->tq_wait_waitq);
}
} else
TQSTAT_INC(tq, thread_wakeups_nowork);
set_current_state(TASK_INTERRUPTIBLE);
@ -998,6 +1128,10 @@ taskq_thread(void *args)
__set_current_state(TASK_RUNNING);
tq->tq_nthreads--;
list_del_init(&tqt->tqt_thread_list);
TQSTAT_DEC(tq, threads_total);
TQSTAT_INC(tq, threads_destroyed);
error:
kmem_free(tqt, sizeof (taskq_thread_t));
spin_unlock_irqrestore(&tq->tq_lock, flags);
@ -1037,9 +1171,156 @@ taskq_thread_create(taskq_t *tq)
wake_up_process(tqt->tqt_thread);
TQSTAT_INC(tq, threads_created);
return (tqt);
}
static void
taskq_stats_init(taskq_t *tq)
{
taskq_sums_t *tqs = &tq->tq_sums;
wmsum_init(&tqs->tqs_threads_active, 0);
wmsum_init(&tqs->tqs_threads_idle, 0);
wmsum_init(&tqs->tqs_threads_total, 0);
wmsum_init(&tqs->tqs_tasks_pending, 0);
wmsum_init(&tqs->tqs_tasks_priority, 0);
wmsum_init(&tqs->tqs_tasks_total, 0);
wmsum_init(&tqs->tqs_tasks_delayed, 0);
wmsum_init(&tqs->tqs_entries_free, 0);
wmsum_init(&tqs->tqs_threads_created, 0);
wmsum_init(&tqs->tqs_threads_destroyed, 0);
wmsum_init(&tqs->tqs_tasks_dispatched, 0);
wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0);
wmsum_init(&tqs->tqs_tasks_executed_normal, 0);
wmsum_init(&tqs->tqs_tasks_executed_priority, 0);
wmsum_init(&tqs->tqs_tasks_executed, 0);
wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0);
wmsum_init(&tqs->tqs_tasks_cancelled, 0);
wmsum_init(&tqs->tqs_thread_wakeups, 0);
wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0);
wmsum_init(&tqs->tqs_thread_sleeps, 0);
}
static void
taskq_stats_fini(taskq_t *tq)
{
taskq_sums_t *tqs = &tq->tq_sums;
wmsum_fini(&tqs->tqs_threads_active);
wmsum_fini(&tqs->tqs_threads_idle);
wmsum_fini(&tqs->tqs_threads_total);
wmsum_fini(&tqs->tqs_tasks_pending);
wmsum_fini(&tqs->tqs_tasks_priority);
wmsum_fini(&tqs->tqs_tasks_total);
wmsum_fini(&tqs->tqs_tasks_delayed);
wmsum_fini(&tqs->tqs_entries_free);
wmsum_fini(&tqs->tqs_threads_created);
wmsum_fini(&tqs->tqs_threads_destroyed);
wmsum_fini(&tqs->tqs_tasks_dispatched);
wmsum_fini(&tqs->tqs_tasks_dispatched_delayed);
wmsum_fini(&tqs->tqs_tasks_executed_normal);
wmsum_fini(&tqs->tqs_tasks_executed_priority);
wmsum_fini(&tqs->tqs_tasks_executed);
wmsum_fini(&tqs->tqs_tasks_delayed_requeued);
wmsum_fini(&tqs->tqs_tasks_cancelled);
wmsum_fini(&tqs->tqs_thread_wakeups);
wmsum_fini(&tqs->tqs_thread_wakeups_nowork);
wmsum_fini(&tqs->tqs_thread_sleeps);
}
static int
taskq_kstats_update(kstat_t *ksp, int rw)
{
if (rw == KSTAT_WRITE)
return (EACCES);
taskq_t *tq = ksp->ks_private;
taskq_kstats_t *tqks = ksp->ks_data;
tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads;
tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc;
tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc;
taskq_sums_t *tqs = &tq->tq_sums;
tqks->tqks_threads_active.value.ui64 =
wmsum_value(&tqs->tqs_threads_active);
tqks->tqks_threads_idle.value.ui64 =
wmsum_value(&tqs->tqs_threads_idle);
tqks->tqks_threads_total.value.ui64 =
wmsum_value(&tqs->tqs_threads_total);
tqks->tqks_tasks_pending.value.ui64 =
wmsum_value(&tqs->tqs_tasks_pending);
tqks->tqks_tasks_priority.value.ui64 =
wmsum_value(&tqs->tqs_tasks_priority);
tqks->tqks_tasks_total.value.ui64 =
wmsum_value(&tqs->tqs_tasks_total);
tqks->tqks_tasks_delayed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_delayed);
tqks->tqks_entries_free.value.ui64 =
wmsum_value(&tqs->tqs_entries_free);
tqks->tqks_threads_created.value.ui64 =
wmsum_value(&tqs->tqs_threads_created);
tqks->tqks_threads_destroyed.value.ui64 =
wmsum_value(&tqs->tqs_threads_destroyed);
tqks->tqks_tasks_dispatched.value.ui64 =
wmsum_value(&tqs->tqs_tasks_dispatched);
tqks->tqks_tasks_dispatched_delayed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_dispatched_delayed);
tqks->tqks_tasks_executed_normal.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed_normal);
tqks->tqks_tasks_executed_priority.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed_priority);
tqks->tqks_tasks_executed.value.ui64 =
wmsum_value(&tqs->tqs_tasks_executed);
tqks->tqks_tasks_delayed_requeued.value.ui64 =
wmsum_value(&tqs->tqs_tasks_delayed_requeued);
tqks->tqks_tasks_cancelled.value.ui64 =
wmsum_value(&tqs->tqs_tasks_cancelled);
tqks->tqks_thread_wakeups.value.ui64 =
wmsum_value(&tqs->tqs_thread_wakeups);
tqks->tqks_thread_wakeups_nowork.value.ui64 =
wmsum_value(&tqs->tqs_thread_wakeups_nowork);
tqks->tqks_thread_sleeps.value.ui64 =
wmsum_value(&tqs->tqs_thread_sleeps);
return (0);
}
static void
taskq_kstats_init(taskq_t *tq)
{
char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance);
kstat_t *ksp = kstat_create("taskq", 0, name, "misc",
KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (ksp == NULL)
return;
ksp->ks_private = tq;
ksp->ks_update = taskq_kstats_update;
ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP);
memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t));
kstat_install(ksp);
tq->tq_ksp = ksp;
}
static void
taskq_kstats_fini(taskq_t *tq)
{
if (tq->tq_ksp == NULL)
return;
kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t));
kstat_delete(tq->tq_ksp);
tq->tq_ksp = NULL;
}
taskq_t *
taskq_create(const char *name, int threads_arg, pri_t pri,
int minalloc, int maxalloc, uint_t flags)
@ -1104,6 +1385,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
init_waitqueue_head(&tq->tq_wait_waitq);
tq->tq_lock_class = TQ_LOCK_GENERAL;
INIT_LIST_HEAD(&tq->tq_taskqs);
taskq_stats_init(tq);
if (flags & TASKQ_PREPOPULATE) {
spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
@ -1137,14 +1419,17 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
if (rc) {
taskq_destroy(tq);
tq = NULL;
} else {
down_write(&tq_list_sem);
tq->tq_instance = taskq_find_by_name(name) + 1;
list_add_tail(&tq->tq_taskqs, &tq_list);
up_write(&tq_list_sem);
return (NULL);
}
down_write(&tq_list_sem);
tq->tq_instance = taskq_find_by_name(name) + 1;
list_add_tail(&tq->tq_taskqs, &tq_list);
up_write(&tq_list_sem);
/* Install kstats late, because the name includes tq_instance */
taskq_kstats_init(tq);
return (tq);
}
EXPORT_SYMBOL(taskq_create);
@ -1177,6 +1462,8 @@ taskq_destroy(taskq_t *tq)
taskq_wait(tq);
taskq_kstats_fini(tq);
/* remove taskq from global list used by the kstats */
down_write(&tq_list_sem);
list_del(&tq->tq_taskqs);
@ -1230,6 +1517,7 @@ taskq_destroy(taskq_t *tq)
spin_unlock_irqrestore(&tq->tq_lock, flags);
taskq_stats_fini(tq);
kmem_strfree(tq->tq_name);
kmem_free(tq, sizeof (taskq_t));
}
@ -1271,6 +1559,100 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri,
}
EXPORT_SYMBOL(taskq_create_synced);
static kstat_t *taskq_summary_ksp = NULL;
static int
spl_taskq_kstat_headers(char *buf, size_t size)
{
size_t n = snprintf(buf, size,
"%-20s | %-17s | %-23s\n"
"%-20s | %-17s | %-23s\n"
"%-20s | %-17s | %-23s\n",
"", "threads", "tasks on queue",
"taskq name", "tot [act idl] max", " pend [ norm high] dly",
"--------------------", "-----------------",
"-----------------------");
return (n >= size ? ENOMEM : 0);
}
static int
spl_taskq_kstat_data(char *buf, size_t size, void *data)
{
struct list_head *tql = NULL;
taskq_t *tq;
char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
char threads[25];
char tasks[30];
size_t n;
int err = 0;
down_read(&tq_list_sem);
list_for_each_prev(tql, &tq_list) {
tq = list_entry(tql, taskq_t, tq_taskqs);
mutex_enter(tq->tq_ksp->ks_lock);
taskq_kstats_update(tq->tq_ksp, KSTAT_READ);
taskq_kstats_t *tqks = tq->tq_ksp->ks_data;
snprintf(name, sizeof (name), "%s.%d", tq->tq_name,
tq->tq_instance);
snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu",
tqks->tqks_threads_total.value.ui64,
tqks->tqks_threads_active.value.ui64,
tqks->tqks_threads_idle.value.ui64,
tqks->tqks_threads_max.value.ui64);
snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu",
tqks->tqks_tasks_total.value.ui64,
tqks->tqks_tasks_pending.value.ui64,
tqks->tqks_tasks_priority.value.ui64,
tqks->tqks_tasks_delayed.value.ui64);
mutex_exit(tq->tq_ksp->ks_lock);
n = snprintf(buf, size, "%-20s | %-17s | %-23s\n",
name, threads, tasks);
if (n >= size) {
err = ENOMEM;
break;
}
buf = &buf[n];
size -= n;
}
up_read(&tq_list_sem);
return (err);
}
static void
spl_taskq_kstat_init(void)
{
kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
if (ksp == NULL)
return;
ksp->ks_data = (void *)(uintptr_t)1;
ksp->ks_ndata = 1;
kstat_set_raw_ops(ksp, spl_taskq_kstat_headers,
spl_taskq_kstat_data, NULL);
kstat_install(ksp);
taskq_summary_ksp = ksp;
}
static void
spl_taskq_kstat_fini(void)
{
if (taskq_summary_ksp == NULL)
return;
kstat_delete(taskq_summary_ksp);
taskq_summary_ksp = NULL;
}
static unsigned int spl_taskq_kick = 0;
/*
@ -1451,12 +1833,16 @@ spl_taskq_init(void)
*/
dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
spl_taskq_kstat_init();
return (0);
}
void
spl_taskq_fini(void)
{
spl_taskq_kstat_fini();
taskq_destroy(dynamic_taskq);
dynamic_taskq = NULL;

View File

@ -186,6 +186,13 @@ issig(void)
schedule();
#endif
/*
* Dequeued SIGSTOP/SIGTSTP.
* Check if the process has another signal pending.
*/
if (signal_pending(current))
return (1);
return (0);
}

View File

@ -58,22 +58,16 @@
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif
#ifdef _KERNEL
#if defined(MAX_ORDER)
#define ABD_MAX_ORDER (MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define ABD_MAX_ORDER (MAX_PAGE_ORDER)
#endif
#else
#define ABD_MAX_ORDER (1)
#endif
typedef struct abd_stats {
kstat_named_t abdstat_struct_size;
@ -193,11 +187,9 @@ abd_t *abd_zero_scatter = NULL;
struct page;
/*
* _KERNEL - Will point to ZERO_PAGE if it is available or it will be
* an allocated zero'd PAGESIZE buffer.
* Userspace - Will be an allocated zero'ed PAGESIZE buffer.
*
* abd_zero_page is assigned to each of the pages of abd_zero_scatter.
* abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
* point to ZERO_PAGE if it is available or it will be an allocated zero'd
* PAGESIZE buffer.
*/
static struct page *abd_zero_page = NULL;
@ -232,7 +224,6 @@ abd_free_struct_impl(abd_t *abd)
ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}
#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
/*
@ -509,7 +500,7 @@ abd_alloc_zero_scatter(void)
ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
@ -520,134 +511,6 @@ abd_alloc_zero_scatter(void)
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}
#else /* _KERNEL */
#ifndef PAGE_SHIFT
#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif
#define zfs_kmap_local(chunk) ((void *)chunk)
#define zfs_kunmap_local(addr) do { (void)(addr); } while (0)
#define local_irq_save(flags) do { (void)(flags); } while (0)
#define local_irq_restore(flags) do { (void)(flags); } while (0)
#define nth_page(pg, i) \
((struct page *)((void *)(pg) + (i) * PAGESIZE))
struct scatterlist {
struct page *page;
int length;
int end;
};
static void
sg_init_table(struct scatterlist *sg, int nr)
{
memset(sg, 0, nr * sizeof (struct scatterlist));
sg[nr - 1].end = 1;
}
/*
* This must be called if any of the sg_table allocation functions
* are called.
*/
static void
abd_free_sg_table(abd_t *abd)
{
int nents = ABD_SCATTER(abd).abd_nents;
vmem_free(ABD_SCATTER(abd).abd_sgl,
nents * sizeof (struct scatterlist));
}
#define for_each_sg(sgl, sg, nr, i) \
for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
unsigned int offset)
{
/* currently we don't use offset */
ASSERT(offset == 0);
sg->page = page;
sg->length = len;
}
static inline struct page *
sg_page(struct scatterlist *sg)
{
return (sg->page);
}
static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
if (sg->end)
return (NULL);
return (sg + 1);
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
unsigned nr_pages = abd_chunkcnt_for_bytes(size);
struct scatterlist *sg;
int i;
ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
sizeof (struct scatterlist), KM_SLEEP);
sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
abd_for_each_sg(abd, sg, nr_pages, i) {
struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
sg_set_page(sg, p, PAGESIZE, 0);
}
ABD_SCATTER(abd).abd_nents = nr_pages;
}
void
abd_free_chunks(abd_t *abd)
{
int i, n = ABD_SCATTER(abd).abd_nents;
struct scatterlist *sg;
abd_for_each_sg(abd, sg, n, i) {
struct page *p = nth_page(sg_page(sg), 0);
umem_free_aligned(p, PAGESIZE);
}
abd_free_sg_table(abd);
}
static void
abd_alloc_zero_scatter(void)
{
unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
struct scatterlist *sg;
int i;
abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
memset(abd_zero_page, 0, PAGESIZE);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
sizeof (struct scatterlist), KM_SLEEP);
sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
}
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}
#endif /* _KERNEL */
boolean_t
abd_size_alloc_linear(size_t size)
{
@ -712,14 +575,10 @@ abd_free_zero_scatter(void)
abd_free_struct(abd_zero_scatter);
abd_zero_scatter = NULL;
ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
abd_unmark_zfs_page(abd_zero_page);
__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}
static int
@ -1014,8 +873,6 @@ abd_cache_reap_now(void)
{
}
#if defined(_KERNEL)
/*
* This is abd_iter_page(), the function underneath abd_iterate_page_func().
* It yields the next page struct and data offset and size within it, without
@ -1297,5 +1154,3 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
#endif /* _KERNEL */

View File

@ -201,9 +201,9 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
* See also the comment above zfs_arc_shrinker_limit.
*/
int64_t can_free = btop(arc_evictable_memory());
int64_t limit = zfs_arc_shrinker_limit != 0 ?
zfs_arc_shrinker_limit : INT64_MAX;
return (MIN(can_free, limit));
if (current_is_kswapd() && zfs_arc_shrinker_limit)
can_free = MIN(can_free, zfs_arc_shrinker_limit);
return (can_free);
}
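/*
 * With this check, zfs_arc_shrinker_limit caps only the count reported
 * to kswapd; every other reclaim context sees the full evictable
 * amount, so the tunable no longer throttles direct reclaim.
 */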
static unsigned long

View File

@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
zfsvfs_t *snap_zfsvfs;
zfs_snapentry_t *se;
char *full_name, *full_path;
char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
NULL };
char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
NULL, NULL, NULL };
char *envp[] = { NULL };
int error;
struct path spath;
@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
* value from call_usermodehelper() will be (exitcode << 8 + signal).
*/
dprintf("mount; name=%s path=%s\n", full_name, full_path);
argv[5] = full_name;
argv[6] = full_path;
argv[6] = full_name;
argv[7] = full_path;
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
if (error) {
if (!(error & MOUNT_BUSY << 8)) {

View File

@ -292,6 +292,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
struct super_block *s;
objset_t *os;
boolean_t issnap = B_FALSE;
int err;
err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
@ -323,6 +324,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
if (zpl_enter(zfsvfs, FTAG) == 0) {
if (os != zfsvfs->z_os)
err = -SET_ERROR(EBUSY);
issnap = zfsvfs->z_issnap;
zpl_exit(zfsvfs, FTAG);
} else {
err = -SET_ERROR(EBUSY);
@ -346,7 +348,11 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
return (ERR_PTR(err));
}
s->s_flags |= SB_ACTIVE;
} else if ((flags ^ s->s_flags) & SB_RDONLY) {
} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
/*
* Skip the read-only check for snapshots, since a snapshot is
* always read-only regardless of whether mount passed the ro flag.
*/
deactivate_locked_super(s);
return (ERR_PTR(-EBUSY));
}

View File

@ -1213,6 +1213,7 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits,
qlimits->io_opt = limits->zql_io_opt;
qlimits->physical_block_size = limits->zql_physical_block_size;
qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
qlimits->features =
@ -1251,7 +1252,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
zso->zvo_disk->minors = ZVOL_MINORS;
zso->zvo_queue = zso->zvo_disk->queue;
zvol_queue_limits_apply(limits, zso->zvo_queue);
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
struct queue_limits qlimits;
zvol_queue_limits_convert(limits, &qlimits);
@ -1261,13 +1261,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
return (1);
}
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
blk_queue_set_write_cache(zso->zvo_queue, B_TRUE);
#endif
zso->zvo_disk = disk;
zso->zvo_disk->minors = ZVOL_MINORS;
zso->zvo_queue = zso->zvo_disk->queue;
#else
zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
if (zso->zvo_queue == NULL)
@ -1361,7 +1358,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
* request queue and generic disk structures for the block device.
*/
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
{
zvol_state_t *zv;
struct zvol_state_os *zso;
@ -1381,6 +1378,7 @@ zvol_alloc(dev_t dev, const char *name)
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso;
zv->zv_volmode = volmode;
zv->zv_volblocksize = volblocksize;
list_link_init(&zv->zv_next);
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
@ -1670,7 +1668,8 @@ zvol_os_create_minor(const char *name)
if (error)
goto out_dmu_objset_disown;
zv = zvol_alloc(MKDEV(zvol_major, minor), name);
zv = zvol_alloc(MKDEV(zvol_major, minor), name,
doi->doi_data_block_size);
if (zv == NULL) {
error = SET_ERROR(EAGAIN);
goto out_dmu_objset_disown;
@ -1680,7 +1679,6 @@ zvol_os_create_minor(const char *name)
if (dmu_objset_is_snapshot(os))
zv->zv_flags |= ZVOL_RDONLY;
zv->zv_volblocksize = doi->doi_data_block_size;
zv->zv_volsize = volsize;
zv->zv_objset = os;

View File

@ -754,6 +754,12 @@ zpool_feature_init(void)
"Support for raidz expansion",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
zfeature_register(SPA_FEATURE_FAST_DEDUP,
"com.klarasystems:fast_dedup", "fast_dedup",
"Support for advanced deduplication",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
sfeatures);
zfs_mod_list_supported_free(sfeatures);
}

View File

@ -0,0 +1,277 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024, Klara Inc.
*/
#include <sys/fs/zfs.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/string.h>
#include <sys/debug.h>
#include "zfs_valstr.h"
/*
* Each bit in a bitfield has three possible string representations:
* - single char
* - two-char pair
* - full name
*/
typedef struct {
const char vb_bit;
const char vb_pair[2];
const char *vb_name;
} valstr_bit_t;
/*
* Emits a character for each bit in `bits`, up to the number of elements
* in the table. Set bits get the character in vb_bit, clear bits get a
* space. This results in all strings having the same width, for easier
* visual comparison.
*/
static size_t
valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems,
uint64_t bits, char *out, size_t outlen)
{
ASSERT(out);
size_t n = 0;
for (int b = 0; b < nelems; b++) {
if (n == outlen)
break;
uint64_t mask = (1ULL << b);
out[n++] = (bits & mask) ? table[b].vb_bit : ' ';
}
if (n < outlen)
out[n++] = '\0';
return (n);
}
/*
* Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and
* separated by a `|` character. This gives a concise representation of the
* whole value.
*/
static size_t
valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems,
uint64_t bits, char *out, size_t outlen)
{
ASSERT(out);
size_t n = 0;
for (int b = 0; b < nelems; b++) {
ASSERT3U(n, <=, outlen);
if (n == outlen)
break;
uint64_t mask = (1ULL << b);
if (bits & mask) {
size_t len = (n > 0) ? 3 : 2;
if (n > outlen-len)
break;
if (n > 0)
out[n++] = '|';
out[n++] = table[b].vb_pair[0];
out[n++] = table[b].vb_pair[1];
}
}
if (n < outlen)
out[n++] = '\0';
return (n);
}
/*
* Emits the full name for each bit set in `bits`, taken from vb_name, and
* separated by a space. This unambiguously shows the entire set of bits, but
* can get very long.
*/
static size_t
valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems,
uint64_t bits, char *out, size_t outlen)
{
ASSERT(out);
size_t n = 0;
for (int b = 0; b < nelems; b++) {
ASSERT3U(n, <=, outlen);
if (n == outlen)
break;
uint64_t mask = (1ULL << b);
if (bits & mask) {
size_t len = strlen(table[b].vb_name);
if (n > 0)
len++;
if (n > outlen-len)
break;
if (n > 0) {
out[n++] = ' ';
len--;
}
memcpy(&out[n], table[b].vb_name, len);
n += len;
}
}
if (n < outlen)
out[n++] = '\0';
return (n);
}
/*
* Emits the name of the given enum value in the table.
*/
static size_t
valstr_enum_str(const char **table, const size_t nelems,
int v, char *out, size_t outlen)
{
ASSERT(out);
ASSERT3U(v, <, nelems);
if (v >= nelems)
return (0);
return (MIN(strlcpy(out, table[v], outlen), outlen));
}
/*
* These macros create the string tables for the given name, and implement
* the public functions described in zfs_valstr.h.
*/
#define _VALSTR_BITFIELD_IMPL(name, ...) \
static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\
size_t \
zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen) \
{ \
return (valstr_bitfield_bits(valstr_ ## name ## _table, \
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
} \
\
size_t \
zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen) \
{ \
return (valstr_bitfield_pairs(valstr_ ## name ## _table, \
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
} \
\
size_t \
zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen) \
{ \
return (valstr_bitfield_str(valstr_ ## name ## _table, \
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
} \
#define _VALSTR_ENUM_IMPL(name, ...) \
static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ }; \
size_t \
zfs_valstr_ ## name(int v, char *out, size_t outlen) \
{ \
return (valstr_enum_str(valstr_ ## name ## _table, \
ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen)); \
} \
/* String tables */
/* ZIO flags: zio_flag_t, typically zio->io_flags */
/* BEGIN CSTYLED */
_VALSTR_BITFIELD_IMPL(zio_flag,
{ '.', "DA", "DONT_AGGREGATE" },
{ '.', "RP", "IO_REPAIR" },
{ '.', "SH", "SELF_HEAL" },
{ '.', "RS", "RESILVER" },
{ '.', "SC", "SCRUB" },
{ '.', "ST", "SCAN_THREAD" },
{ '.', "PH", "PHYSICAL" },
{ '.', "CF", "CANFAIL" },
{ '.', "SP", "SPECULATIVE" },
{ '.', "CW", "CONFIG_WRITER" },
{ '.', "DR", "DONT_RETRY" },
{ '?', "??", "[UNUSED 11]" },
{ '.', "ND", "NODATA" },
{ '.', "ID", "INDUCE_DAMAGE" },
{ '.', "AL", "IO_ALLOCATING" },
{ '.', "RE", "IO_RETRY" },
{ '.', "PR", "PROBE" },
{ '.', "TH", "TRYHARD" },
{ '.', "OP", "OPTIONAL" },
{ '.', "DQ", "DONT_QUEUE" },
{ '.', "DP", "DONT_PROPAGATE" },
{ '.', "BY", "IO_BYPASS" },
{ '.', "RW", "IO_REWRITE" },
{ '.', "CM", "RAW_COMPRESS" },
{ '.', "EN", "RAW_ENCRYPT" },
{ '.', "GG", "GANG_CHILD" },
{ '.', "DD", "DDT_CHILD" },
{ '.', "GF", "GODFATHER" },
{ '.', "NP", "NOPWRITE" },
{ '.', "EX", "REEXECUTED" },
{ '.', "DG", "DELEGATED" },
)
/* END CSTYLED */
/*
* ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or
* zio->io_pipeline.
*/
/* BEGIN CSTYLED */
_VALSTR_BITFIELD_IMPL(zio_stage,
{ 'O', "O ", "OPEN" },
{ 'I', "RI", "READ_BP_INIT" },
{ 'I', "WI", "WRITE_BP_INIT" },
{ 'I', "FI", "FREE_BP_INIT" },
{ 'A', "IA", "ISSUE_ASYNC" },
{ 'W', "WC", "WRITE_COMPRESS" },
{ 'E', "EN", "ENCRYPT" },
{ 'C', "CG", "CHECKSUM_GENERATE" },
{ 'N', "NW", "NOP_WRITE" },
{ 'B', "BF", "BRT_FREE" },
{ 'd', "dS", "DDT_READ_START" },
{ 'd', "dD", "DDT_READ_DONE" },
{ 'd', "dW", "DDT_WRITE" },
{ 'd', "dF", "DDT_FREE" },
{ 'G', "GA", "GANG_ASSEMBLE" },
{ 'G', "GI", "GANG_ISSUE" },
{ 'D', "DT", "DVA_THROTTLE" },
{ 'D', "DA", "DVA_ALLOCATE" },
{ 'D', "DF", "DVA_FREE" },
{ 'D', "DC", "DVA_CLAIM" },
{ 'R', "R ", "READY" },
{ 'V', "VS", "VDEV_IO_START" },
{ 'V', "VD", "VDEV_IO_DONE" },
{ 'V', "VA", "VDEV_IO_ASSESS" },
{ 'C', "CV", "CHECKSUM_VERIFY" },
{ 'X', "X ", "DONE" },
)
/* END CSTYLED */
/* ZIO priority: zio_priority_t, typically zio->io_priority */
/* BEGIN CSTYLED */
_VALSTR_ENUM_IMPL(zio_priority,
"SYNC_READ",
"SYNC_WRITE",
"ASYNC_READ",
"ASYNC_WRITE",
"SCRUB",
"REMOVAL",
"INITIALIZING",
"TRIM",
"REBUILD",
"[NUM_QUEUEABLE]",
"NOW",
)
/* END CSTYLED */
#undef _VALSTR_BITFIELD_IMPL
#undef _VALSTR_ENUM_IMPL
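/*
 * Usage sketch (illustrative; `flags` stands for any zio_flag_t value
 * and the buffer size is arbitrary). For a table declared with
 * _VALSTR_BITFIELD_IMPL(zio_flag, ...), three functions are generated:
 *
 *	char buf[128];
 *
 *	zfs_valstr_zio_flag_bits(flags, buf, sizeof (buf));
 *		fixed-width output, one character per bit position
 *	zfs_valstr_zio_flag_pairs(flags, buf, sizeof (buf));
 *		e.g. "RP|SH|RE" for IO_REPAIR|SELF_HEAL|IO_RETRY
 *	zfs_valstr_zio_flag(flags, buf, sizeof (buf));
 *		e.g. "IO_REPAIR SELF_HEAL IO_RETRY" for the same value
 */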

View File

@ -113,7 +113,7 @@ abd_verify(abd_t *abd)
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
@ -603,13 +603,11 @@ abd_get_zeros(size_t size)
}
/*
* Allocate a linear ABD structure for buf.
* Create a linear ABD for an existing buf.
*/
abd_t *
abd_get_from_buf(void *buf, size_t size)
static abd_t *
abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
{
abd_t *abd = abd_alloc_struct(0);
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
/*
@ -625,6 +623,20 @@ abd_get_from_buf(void *buf, size_t size)
return (abd);
}
abd_t *
abd_get_from_buf(void *buf, size_t size)
{
abd_t *abd = abd_alloc_struct(0);
return (abd_get_from_buf_impl(abd, buf, size));
}
abd_t *
abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
{
abd_init_struct(abd);
return (abd_get_from_buf_impl(abd, buf, size));
}
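/*
 * Usage sketch (illustrative; `buf` and `len` are placeholders):
 * abd_get_from_buf_struct() enables the borrow-on-stack pattern used
 * by several callers in this merge (decode_embedded_bp,
 * ddt_zap_decompress, arc_buf_fill): wrap an existing buffer in a
 * caller-owned abd_t with no allocation, operate on it as a linear
 * ABD, then tear the wrapper down:
 *
 *	abd_t dabd;
 *	abd_get_from_buf_struct(&dabd, buf, len);
 *	... read or write through &dabd ...
 *	abd_free(&dabd);	releases the wrapper only, not buf
 */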
/*
* Get the raw buffer associated with a linear ABD.
*/

View File

@ -1767,12 +1767,12 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
uint64_t csize;
uint64_t lsize = HDR_GET_LSIZE(hdr);
uint64_t psize = HDR_GET_PSIZE(hdr);
void *tmpbuf = NULL;
abd_t *abd = hdr->b_l1hdr.b_pabd;
boolean_t free_abd = B_FALSE;
ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT(HDR_AUTHENTICATED(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3P(abd, !=, NULL);
/*
* The MAC is calculated on the compressed data that is stored on disk.
@ -1784,14 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
!HDR_COMPRESSION_ENABLED(hdr)) {
abd = NULL;
csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel);
ASSERT3P(tmpbuf, !=, NULL);
hdr->b_l1hdr.b_pabd, &abd, lsize, hdr->b_complevel);
ASSERT3P(abd, !=, NULL);
ASSERT3U(csize, <=, psize);
abd = abd_get_from_buf(tmpbuf, lsize);
abd_take_ownership_of_buf(abd, B_TRUE);
abd_zero_off(abd, csize, psize - csize);
free_abd = B_TRUE;
}
/*
@ -1810,16 +1809,10 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
if (ret == 0)
arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
else if (ret != ENOENT)
goto error;
else if (ret == ENOENT)
ret = 0;
if (tmpbuf != NULL)
abd_free(abd);
return (0);
error:
if (tmpbuf != NULL)
if (free_abd)
abd_free(abd);
return (ret);
@ -1836,7 +1829,6 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
{
int ret;
abd_t *cabd = NULL;
void *tmp = NULL;
boolean_t no_crypt = B_FALSE;
boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
@ -1871,17 +1863,14 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
* linear buffer and wrapping it in an abd later.
*/
cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
HDR_GET_LSIZE(hdr), &hdr->b_complevel);
if (ret != 0) {
abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
goto error;
}
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_pabd = cabd;
@ -2123,10 +2112,14 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
/* Skip byteswapping and checksumming (already done) */
return (0);
} else {
abd_t dabd;
abd_get_from_buf_struct(&dabd, buf->b_data,
HDR_GET_LSIZE(hdr));
error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, buf->b_data,
hdr->b_l1hdr.b_pabd, &dabd,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
&hdr->b_complevel);
abd_free(&dabd);
/*
* Absent hardware errors or software bugs, this should
@ -8531,18 +8524,15 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
!HDR_COMPRESSION_ENABLED(hdr)) {
abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
ARC_HDR_USE_RESERVE);
void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
HDR_GET_LSIZE(hdr), &hdr->b_complevel);
if (ret != 0) {
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
goto error;
}
abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_pabd = cabd;
@ -9037,9 +9027,8 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
}
if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
size_t bufsize = MAX(size, asize);
void *buf = zio_buf_alloc(bufsize);
uint64_t csize = zio_compress_data(compress, to_write, &buf,
cabd = abd_alloc_for_io(MAX(size, asize), ismd);
uint64_t csize = zio_compress_data(compress, to_write, &cabd,
size, hdr->b_complevel);
if (csize > psize) {
/*
@ -9047,13 +9036,12 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
* psize. Even if it fits into asize, it does not
* matter, since checksum will never match on read.
*/
zio_buf_free(buf, bufsize);
abd_free(cabd);
return (SET_ERROR(EIO));
}
if (asize > csize)
memset((char *)buf + csize, 0, asize - csize);
to_write = cabd = abd_get_from_buf(buf, bufsize);
abd_take_ownership_of_buf(cabd, B_TRUE);
abd_zero_off(cabd, csize, asize - csize);
to_write = cabd;
}
if (HDR_ENCRYPTED(hdr)) {
@ -9158,12 +9146,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
*/
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
/*
* If pass == 1 or 3, we cache MRU metadata and data
* respectively.
* pass == 0: MFU meta
* pass == 1: MRU meta
* pass == 2: MFU data
* pass == 3: MRU data
*/
if (l2arc_mfuonly) {
if (l2arc_mfuonly == 1) {
if (pass == 1 || pass == 3)
continue;
} else if (l2arc_mfuonly > 1) {
if (pass == 3)
continue;
}
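/*
 * In summary: l2arc_mfuonly=1 caches only MFU metadata and data
 * (passes 1 and 3 skipped); l2arc_mfuonly>=2 caches both MRU and MFU
 * metadata but only MFU data (pass 3 skipped).
 */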
uint64_t passed_sz = 0;
@ -10179,7 +10172,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
{
int err = 0;
zio_cksum_t cksum;
abd_t *abd = NULL;
uint64_t asize;
ASSERT(this_lbp != NULL && next_lbp != NULL);
@ -10241,16 +10233,22 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
case ZIO_COMPRESS_OFF:
break;
case ZIO_COMPRESS_LZ4:
abd = abd_alloc_for_io(asize, B_TRUE);
case ZIO_COMPRESS_LZ4: {
abd_t *abd = abd_alloc_linear(asize, B_TRUE);
abd_copy_from_buf_off(abd, this_lb, 0, asize);
if ((err = zio_decompress_data(
abd_t dabd;
abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
err = zio_decompress_data(
L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
abd, &dabd, asize, sizeof (*this_lb), NULL);
abd_free(&dabd);
abd_free(abd);
if (err != 0) {
err = SET_ERROR(EINVAL);
goto cleanup;
}
break;
}
default:
err = SET_ERROR(EINVAL);
goto cleanup;
@ -10267,8 +10265,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev,
l2arc_log_blk_fetch_abort(*next_io);
*next_io = NULL;
}
if (abd != NULL)
abd_free(abd);
return (err);
}
@ -10504,7 +10500,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
uint64_t psize, asize;
zio_t *wzio;
l2arc_lb_abd_buf_t *abd_buf;
uint8_t *tmpbuf = NULL;
abd_t *abd = NULL;
l2arc_lb_ptr_buf_t *lb_ptr_buf;
VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
@ -10527,7 +10523,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
/* try to compress the buffer */
psize = zio_compress_data(ZIO_COMPRESS_LZ4,
abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0);
abd_buf->abd, &abd, sizeof (*lb), 0);
/* a log block is never entirely zero */
ASSERT(psize != 0);
@ -10553,27 +10549,26 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
ZIO_CHECKSUM_FLETCHER_4);
if (asize < sizeof (*lb)) {
/* compression succeeded */
memset(tmpbuf + psize, 0, asize - psize);
abd_zero_off(abd, psize, asize - psize);
L2BLK_SET_COMPRESS(
(&l2dhdr->dh_start_lbps[0])->lbp_prop,
ZIO_COMPRESS_LZ4);
} else {
/* compression failed */
memcpy(tmpbuf, lb, sizeof (*lb));
abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
L2BLK_SET_COMPRESS(
(&l2dhdr->dh_start_lbps[0])->lbp_prop,
ZIO_COMPRESS_OFF);
}
/* checksum what we're about to write */
fletcher_4_native(tmpbuf, asize, NULL,
abd_fletcher_4_native(abd, asize, NULL,
&l2dhdr->dh_start_lbps[0].lbp_cksum);
abd_free(abd_buf->abd);
/* perform the write itself */
abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
abd_buf->abd = abd;
wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);

View File

@ -142,8 +142,13 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
uint8_t dstbuf[BPE_PAYLOAD_SIZE];
decode_embedded_bp_compressed(bp, dstbuf);
VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
dstbuf, buf, psize, buflen, NULL));
abd_t cabd, dabd;
abd_get_from_buf_struct(&cabd, dstbuf, psize);
abd_get_from_buf_struct(&dabd, buf, buflen);
VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &cabd,
&dabd, psize, buflen, NULL));
abd_free(&dabd);
abd_free(&cabd);
} else {
ASSERT3U(lsize, ==, psize);
decode_embedded_bp_compressed(bp, buf);

View File

@ -204,6 +204,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
void
dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
{
if (dk->dk_kstats == NULL)
return;
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
char *ds_name;

File diff suppressed because it is too large

View File

@ -0,0 +1,778 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2023, Klara Inc.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
/*
* No more than this many txgs before swapping logs.
*/
uint_t zfs_dedup_log_txg_max = 8;
/*
* Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
* load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
*/
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;
static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;
#define DDT_LOG_ENTRY_FLAT_SIZE \
(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define DDT_LOG_ENTRY_TRAD_SIZE \
(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)
#define DDT_LOG_ENTRY_SIZE(ddt) \
_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
void
ddt_log_init(void)
{
ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
/*
* Max memory for log AVL entries. At least 1M, because we need
* something (that's ~3800 entries per tree). They can say 100% if they
* want; it just means they're at the mercy of the txg flush limit.
*/
if (zfs_dedup_log_mem_max == 0) {
zfs_dedup_log_mem_max_percent =
MIN(zfs_dedup_log_mem_max_percent, 100);
zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
zfs_dedup_log_mem_max_percent / 100;
}
zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
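/*
 * Worked example (hypothetical machine): with 16 GiB of memory and the
 * default zfs_dedup_log_mem_max_percent of 1, the cap computes to
 * 17179869184 * 1 / 100 ~= 164 MiB, well above the 1 MiB floor. Since
 * ddt_log_swap() compares each log tree against half the cap, either
 * tree may hold roughly 82 MiB of entries before a swap is forced.
 */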
void
ddt_log_fini(void)
{
kmem_cache_destroy(ddt_log_entry_trad_cache);
kmem_cache_destroy(ddt_log_entry_flat_cache);
}
static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}
static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
dmu_buf_t *db;
VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
DLH_SET_VERSION(hdr, 1);
DLH_SET_FLAGS(hdr, ddl->ddl_flags);
hdr->dlh_length = ddl->ddl_length;
hdr->dlh_first_txg = ddl->ddl_first_txg;
hdr->dlh_checkpoint = ddl->ddl_checkpoint;
dmu_buf_rele(db, FTAG);
}
static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
ASSERT3U(ddt->ddt_dir_object, >, 0);
ASSERT3U(ddl->ddl_object, ==, 0);
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
sizeof (uint64_t), 1, &ddl->ddl_object, tx));
ddl->ddl_length = 0;
ddl->ddl_first_txg = tx->tx_txg;
ddt_log_update_header(ddt, ddl, tx);
}
static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}
static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
ASSERT3U(ddt->ddt_dir_object, >, 0);
if (ddl->ddl_object == 0)
return;
ASSERT0(ddl->ddl_length);
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
ddl->ddl_object = 0;
}
void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}
static void
ddt_log_update_stats(ddt_t *ddt)
{
/*
* Log object stats. We count the number of live entries in the log
* tree, even if there are more entries than are on disk, and even if
* the same entry is on both the active and flushing trees, because
* that's closer to what the user expects to see. This does mean the
* on-disk size is not
* really correlated with the number of entries, but I don't think
* that's reasonable to expect anyway.
*/
dmu_object_info_t doi;
uint64_t nblocks;
dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
nblocks = doi.doi_physical_blocks_512;
dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
nblocks += doi.doi_physical_blocks_512;
ddt_object_t *ddo = &ddt->ddt_log_stats;
ddo->ddo_count =
avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
ddo->ddo_dspace = nblocks << 9;
}
void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
ASSERT3U(nentries, >, 0);
ASSERT3P(dlu->dlu_dbp, ==, NULL);
if (ddt->ddt_log_active->ddl_object == 0)
ddt_log_create(ddt, tx);
/*
* We want to store as many entries as we can in a block, but never
* split an entry across block boundaries.
*/
size_t reclen = P2ALIGN_TYPED(
sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
ASSERT3U(reclen, <=, UINT16_MAX);
dlu->dlu_reclen = reclen;
VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
&dlu->dlu_dn));
dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
uint64_t nblocks = howmany(nentries,
dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
uint64_t offset = ddt->ddt_log_active->ddl_length;
uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
DMU_READ_NO_PREFETCH));
dlu->dlu_tx = tx;
dlu->dlu_block = dlu->dlu_offset = 0;
}
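/*
 * Sizing sketch (numbers hypothetical; the real record size depends on
 * DDT_PHYS_SIZE): if reclen works out to 112 bytes and dn_datablksz is
 * 32768, each block holds 32768 / 112 = 292 whole records, so
 * nentries = 1000 requires howmany(1000, 292) = 4 blocks, and the
 * buffers are held starting at offset ddl_length.
 */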
static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
ddt_log_entry_t *ddle;
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
} else {
ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
}
return (ddle);
}
static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
/* Create the log tree entry from a live or stored entry */
avl_index_t where;
ddt_log_entry_t *ddle =
avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
if (ddle == NULL) {
ddle = ddt_log_alloc_entry(ddt);
ddle->ddle_key = ddlwe->ddlwe_key;
avl_insert(&ddl->ddl_tree, ddle, where);
}
ddle->ddle_type = ddlwe->ddlwe_type;
ddle->ddle_class = ddlwe->ddlwe_class;
memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}
void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
ASSERT3U(dlu->dlu_dbp, !=, NULL);
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
/* Get our block */
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
/*
* If this would take us past the end of the block, finish it and
* move to the next one.
*/
if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
ASSERT3U(dlu->dlu_offset, >, 0);
dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
dlu->dlu_block++;
dlu->dlu_offset = 0;
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
db = dlu->dlu_dbp[dlu->dlu_block];
}
/*
* If this is the first time touching the block, inform the DMU that
* we will fill it, and zero it out.
*/
if (dlu->dlu_offset == 0) {
dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
memset(db->db_data, 0, db->db_size);
}
/* Create the log record directly in the buffer */
ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
DLR_SET_TYPE(dlr, DLR_ENTRY);
DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
ddt_log_record_entry_t *dlre =
(ddt_log_record_entry_t *)&dlr->dlr_payload;
dlre->dlre_key = ddlwe->ddlwe_key;
memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
/* Advance offset for next record. */
dlu->dlu_offset += dlu->dlu_reclen;
}
void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
ASSERT3U(dlu->dlu_dbp, !=, NULL);
ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
ASSERT3U(dlu->dlu_offset, >, 0);
/*
* Close out the last block. Whatever we haven't used will be zeroed,
* which matches DLR_INVALID, so we can detect this during load.
*/
dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
ddt->ddt_log_active->ddl_length +=
dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
dnode_rele(dlu->dlu_dn, FTAG);
ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
memset(dlu, 0, sizeof (ddt_log_update_t));
ddt_log_update_stats(ddt);
}
boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
if (ddle == NULL)
return (B_FALSE);
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
return (B_TRUE);
}
boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
if (ddle == NULL)
return (B_FALSE);
ddt_lightweight_entry_t ddlwe;
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
return (B_TRUE);
}
boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe)
{
ddt_log_entry_t *ddle =
avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
if (!ddle)
ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
if (!ddle)
return (B_FALSE);
if (ddlwe)
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
return (B_TRUE);
}
void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
ddt_log_t *ddl = ddt->ddt_log_flushing;
ASSERT3U(ddl->ddl_object, !=, 0);
#ifdef ZFS_DEBUG
/*
* There should not be any entries on the log tree before the given
* checkpoint. Assert that this is the case.
*/
ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
if (ddle != NULL)
VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
>, 0);
#endif
ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
ddl->ddl_checkpoint = ddlwe->ddlwe_key;
ddt_log_update_header(ddt, ddl, tx);
ddt_log_update_stats(ddt);
}
void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
ddt_log_t *ddl = ddt->ddt_log_flushing;
if (ddl->ddl_object == 0)
return;
ASSERT(avl_is_empty(&ddl->ddl_tree));
/* Eject the entire object */
dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);
ddl->ddl_length = 0;
ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
ddt_log_update_header(ddt, ddl, tx);
ddt_log_update_stats(ddt);
}
boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
/* Swap the logs. The old flushing one must be empty */
VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
/*
* If there are still blocks on the flushing log, truncate it first.
* This can happen if there were entries on the flushing log that were
* removed in memory via ddt_lookup(); their vestigial remains are
* on disk.
*/
if (ddt->ddt_log_flushing->ddl_length > 0)
ddt_log_truncate(ddt, tx);
/*
* Swap policy. We swap the logs (and so begin flushing) when the
* active tree grows too large, or when we haven't swapped it in
* some amount of time, or if something has requested the logs be
* flushed ASAP (see ddt_walk_init()).
*/
/*
* The log tree is too large if the memory usage of its entries is over
* half of the memory limit. This effectively gives each log tree half
* the available memory.
*/
const boolean_t too_large =
(avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
const boolean_t too_old =
tx->tx_txg >=
(ddt->ddt_log_active->ddl_first_txg +
MAX(1, zfs_dedup_log_txg_max));
const boolean_t force =
ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
if (!(too_large || too_old || force))
return (B_FALSE);
ddt_log_t *swap = ddt->ddt_log_active;
ddt->ddt_log_active = ddt->ddt_log_flushing;
ddt->ddt_log_flushing = swap;
ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
ddt->ddt_log_active->ddl_flags &=
~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
ddt_log_update_stats(ddt);
return (B_TRUE);
}
static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
const ddt_key_t *checkpoint)
{
ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
ddt_log_record_entry_t *dlre =
(ddt_log_record_entry_t *)dlr->dlr_payload;
if (checkpoint != NULL &&
ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
/* Skip pre-checkpoint entries; they're already flushed. */
return;
}
ddt_lightweight_entry_t ddlwe;
ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
ddlwe.ddlwe_key = dlre->dlre_key;
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
ddt_log_update_entry(ddt, ddl, &ddlwe);
}
static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
void *cookie = NULL;
ddt_log_entry_t *ddle;
IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
while ((ddle =
avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
}
ASSERT(avl_is_empty(&ddl->ddl_tree));
}
static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
ASSERT3U(n, <, 2);
ddt_log_t *ddl = &ddt->ddt_log[n];
char name[DDT_NAMELEN];
ddt_log_name(ddt, name, n);
uint64_t obj;
int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
sizeof (uint64_t), 1, &obj);
if (err == ENOENT)
return (0);
if (err != 0)
return (err);
dnode_t *dn;
err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
if (err != 0)
return (err);
ddt_log_header_t hdr;
dmu_buf_t *db;
err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
dnode_rele(dn, FTAG);
return (err);
}
memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
dmu_buf_rele(db, FTAG);
if (DLH_GET_VERSION(&hdr) != 1) {
dnode_rele(dn, FTAG);
zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
"unknown version=%llu", spa_name(ddt->ddt_spa), name,
(u_longlong_t)DLH_GET_VERSION(&hdr));
return (SET_ERROR(EINVAL));
}
ddt_key_t *checkpoint = NULL;
if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
/*
* If the log has a checkpoint, then we can ignore any entries
* that have already been flushed.
*/
ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
checkpoint = &hdr.dlh_checkpoint;
}
if (hdr.dlh_length > 0) {
dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
ZIO_PRIORITY_SYNC_READ);
for (uint64_t offset = 0; offset < hdr.dlh_length;
offset += dn->dn_datablksz) {
err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
DMU_READ_PREFETCH);
if (err != 0) {
dnode_rele(dn, FTAG);
ddt_log_empty(ddt, ddl);
return (err);
}
uint64_t boffset = 0;
while (boffset < db->db_size) {
ddt_log_record_t *dlr =
(ddt_log_record_t *)(db->db_data + boffset);
/* Partially-filled block, skip the rest */
if (DLR_GET_TYPE(dlr) == DLR_INVALID)
break;
switch (DLR_GET_TYPE(dlr)) {
case DLR_ENTRY:
ddt_log_load_entry(ddt, ddl, dlr,
checkpoint);
break;
default:
dmu_buf_rele(db, FTAG);
dnode_rele(dn, FTAG);
ddt_log_empty(ddt, ddl);
return (SET_ERROR(EINVAL));
}
boffset += DLR_GET_RECLEN(dlr);
}
dmu_buf_rele(db, FTAG);
}
}
dnode_rele(dn, FTAG);
ddl->ddl_object = obj;
ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
ddl->ddl_length = hdr.dlh_length;
ddl->ddl_first_txg = hdr.dlh_first_txg;
if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
ddt->ddt_log_flushing = ddl;
else
ddt->ddt_log_active = ddl;
return (0);
}
int
ddt_log_load(ddt_t *ddt)
{
int err;
if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
/*
* The DDT is going to be freed again in a moment, so there's
* no point loading the log; it'll just slow down import.
*/
return (0);
}
ASSERT0(ddt->ddt_log[0].ddl_object);
ASSERT0(ddt->ddt_log[1].ddl_object);
if (ddt->ddt_dir_object == 0) {
/*
* If we're configured but the containing dir doesn't exist
* yet, then the log object can't possibly exist either.
*/
ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
return (SET_ERROR(ENOENT));
}
if ((err = ddt_log_load_one(ddt, 0)) != 0)
return (err);
if ((err = ddt_log_load_one(ddt, 1)) != 0)
return (err);
VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
/*
* We have two finalisation tasks:
*
* - rebuild the histogram. We do this at the end rather than while
* we're loading so we don't need to uncount and recount entries that
* appear multiple times in the log.
*
* - remove entries from the flushing tree that are on both trees. This
* happens when ddt_lookup() rehydrates an entry from the flushing
* tree, as ddt_log_take_key() removes the entry from the in-memory
* tree but doesn't remove it from disk.
*/
/*
* We don't technically need a config lock here, since there shouldn't
* be pool config changes during DDT load. dva_get_dsize_sync() via
* ddt_stat_generate() is expecting it though, and it won't hurt
* anything, so we take it.
*/
spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
ddt_log_entry_t *ae = avl_first(al);
ddt_log_entry_t *fe = avl_first(fl);
while (ae != NULL || fe != NULL) {
ddt_log_entry_t *ddle;
if (ae == NULL) {
/* active exhausted, take flushing */
ddle = fe;
fe = AVL_NEXT(fl, fe);
} else if (fe == NULL) {
/* flushing exhausted, take active */
ddle = ae;
ae = AVL_NEXT(al, ae);
} else {
/* compare active and flushing */
int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
if (c < 0) {
/* active behind, take and advance */
ddle = ae;
ae = AVL_NEXT(al, ae);
} else if (c > 0) {
/* flushing behind, take and advance */
ddle = fe;
fe = AVL_NEXT(fl, fe);
} else {
/* match. remove from flushing, take active */
ddle = fe;
fe = AVL_NEXT(fl, fe);
avl_remove(fl, ddle);
ddle = ae;
ae = AVL_NEXT(al, ae);
}
}
ddt_lightweight_entry_t ddlwe;
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
}
spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
ddt_log_update_stats(ddt);
return (0);
}
void
ddt_log_alloc(ddt_t *ddt)
{
ASSERT3P(ddt->ddt_log_active, ==, NULL);
ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
ddt->ddt_log_active = &ddt->ddt_log[0];
ddt->ddt_log_flushing = &ddt->ddt_log[1];
ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}
void
ddt_log_free(ddt_t *ddt)
{
ddt_log_empty(ddt, &ddt->ddt_log[0]);
ddt_log_empty(ddt, &ddt->ddt_log[1]);
avl_destroy(&ddt->ddt_log[0].ddl_tree);
avl_destroy(&ddt->ddt_log[1].ddl_tree);
}
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
"Max transactions before starting to flush dedup logs");
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
"Max memory for dedup logs");
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
"Max memory for dedup logs, as % of total memory");

View File

@ -33,27 +33,32 @@
#include <sys/ddt_impl.h>
static void
ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
ddt_stat_t *dds)
{
spa_t *spa = ddt->ddt_spa;
ddt_phys_t *ddp = dde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
uint64_t lsize = DDK_GET_LSIZE(ddk);
uint64_t psize = DDK_GET_PSIZE(ddk);
uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key);
uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key);
memset(dds, 0, sizeof (*dds));
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
uint64_t dsize = 0;
uint64_t refcnt = ddp->ddp_refcnt;
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
if (ddp->ddp_phys_birth == 0)
if (ddt_phys_birth(ddp, v) == 0)
continue;
int ndvas = DDK_GET_CRYPT(&dde->dde_key) ?
SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
int ndvas = ddt_phys_dva_count(ddp, v,
DDK_GET_CRYPT(&ddlwe->ddlwe_key));
const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
uint64_t dsize = 0;
for (int d = 0; d < ndvas; d++)
dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
dsize += dva_get_dsize_sync(spa, &dvas[d]);
uint64_t refcnt = ddt_phys_refcnt(ddp, v);
dds->dds_blocks += 1;
dds->dds_lsize += lsize;
@ -67,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
}
}
void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
static void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src)
{
const uint64_t *s = (const uint64_t *)src;
uint64_t *d = (uint64_t *)dst;
uint64_t *d_end = (uint64_t *)(dst + 1);
dst->dds_blocks += src->dds_blocks;
dst->dds_lsize += src->dds_lsize;
dst->dds_psize += src->dds_psize;
dst->dds_dsize += src->dds_dsize;
dst->dds_ref_blocks += src->dds_ref_blocks;
dst->dds_ref_lsize += src->dds_ref_lsize;
dst->dds_ref_psize += src->dds_ref_psize;
dst->dds_ref_dsize += src->dds_ref_dsize;
}
ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
static void
ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src)
{
/* This caught more during development than you might expect... */
ASSERT3U(dst->dds_blocks, >=, src->dds_blocks);
ASSERT3U(dst->dds_lsize, >=, src->dds_lsize);
ASSERT3U(dst->dds_psize, >=, src->dds_psize);
ASSERT3U(dst->dds_dsize, >=, src->dds_dsize);
ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks);
ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize);
ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize);
ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize);
for (int i = 0; i < d_end - d; i++)
d[i] += (s[i] ^ neg) - neg;
dst->dds_blocks -= src->dds_blocks;
dst->dds_lsize -= src->dds_lsize;
dst->dds_psize -= src->dds_psize;
dst->dds_dsize -= src->dds_dsize;
dst->dds_ref_blocks -= src->dds_ref_blocks;
dst->dds_ref_lsize -= src->dds_ref_lsize;
dst->dds_ref_psize -= src->dds_ref_psize;
dst->dds_ref_dsize -= src->dds_ref_dsize;
}
void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe)
{
ddt_stat_t dds;
ddt_histogram_t *ddh;
int bucket;
ddt_stat_generate(ddt, dde, &dds);
ddt_stat_generate(ddt, ddlwe, &dds);
bucket = highbit64(dds.dds_ref_blocks) - 1;
ASSERT3U(bucket, >=, 0);
if (bucket < 0)
return;
ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
ddt_stat_add(&ddh->ddh_stat[bucket], &dds);
}
ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
void
ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
const ddt_lightweight_entry_t *ddlwe)
{
ddt_stat_t dds;
int bucket;
ddt_stat_generate(ddt, ddlwe, &dds);
bucket = highbit64(dds.dds_ref_blocks) - 1;
if (bucket < 0)
return;
ddt_stat_sub(&ddh->ddh_stat[bucket], &dds);
}
void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{
for (int h = 0; h < 64; h++)
ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]);
}
void
ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh)
{
memset(dds, 0, sizeof (*dds));
for (int h = 0; h < 64; h++)
ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
ddt_stat_add(dds, &ddh->ddh_stat[h]);
}
boolean_t
ddt_histogram_empty(const ddt_histogram_t *ddh)
{
const uint64_t *s = (const uint64_t *)ddh;
const uint64_t *s_end = (const uint64_t *)(ddh + 1);
for (int h = 0; h < 64; h++) {
const ddt_stat_t *dds = &ddh->ddh_stat[h];
while (s < s_end)
if (*s++ != 0)
return (B_FALSE);
if (dds->dds_blocks == 0 &&
dds->dds_lsize == 0 &&
dds->dds_psize == 0 &&
dds->dds_dsize == 0 &&
dds->dds_ref_blocks == 0 &&
dds->dds_ref_lsize == 0 &&
dds->dds_ref_psize == 0 &&
dds->dds_ref_dsize == 0)
continue;
return (B_FALSE);
}
return (B_TRUE);
}
@ -170,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
ddo_total->ddo_mspace += ddo->ddo_mspace;
}
}
ddt_object_t *ddo = &ddt->ddt_log_stats;
ddo_total->ddo_count += ddo->ddo_count;
ddo_total->ddo_dspace += ddo->ddo_dspace;
ddo_total->ddo_mspace += ddo->ddo_mspace;
}
/*
@ -207,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
&ddt->ddt_histogram_cache[type][class]);
}
}
ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
}
}
@ -217,7 +276,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
ddt_get_dedup_histogram(spa, ddh_total);
ddt_histogram_stat(dds_total, ddh_total);
ddt_histogram_total(dds_total, ddh_total);
kmem_free(ddh_total, sizeof (ddt_histogram_t));
}

View File

@ -22,6 +22,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
* Copyright (c) 2023, Klara Inc.
*/
#include <sys/zfs_context.h>
@ -51,8 +52,13 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)
ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */
c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1,
ci->ci_level);
/* Call compress function directly to avoid hole detection. */
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, (void *)src, s_len);
abd_get_from_buf_struct(&dabd, dst, d_len);
c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level);
abd_free(&dabd);
abd_free(&sabd);
if (c_len == s_len) {
cpfunc = ZIO_COMPRESS_OFF;
@ -71,12 +77,18 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{
uchar_t version = *src++;
int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;
zio_compress_info_t *ci = &zio_compress_table[cpfunc];
if (ci->ci_decompress != NULL)
(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
else
if (zio_compress_table[cpfunc].ci_decompress == NULL) {
memcpy(dst, src, d_len);
return;
}
abd_t sabd, dabd;
abd_get_from_buf_struct(&sabd, src, s_len);
abd_get_from_buf_struct(&dabd, dst, d_len);
VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL));
abd_free(&dabd);
abd_free(&sabd);
if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
(ZFS_HOST_BYTEORDER != 0))
@ -108,7 +120,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
static int
ddt_zap_lookup(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize)
const ddt_key_t *ddk, void *phys, size_t psize)
{
uchar_t *cbuf;
uint64_t one, csize;
@ -155,7 +167,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object)
static int
ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx)
const void *phys, size_t psize, dmu_tx_t *tx)
{
const size_t cbuf_size = psize + 1;
@ -181,7 +193,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
static int
ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
ddt_phys_t *phys, size_t psize)
void *phys, size_t psize)
{
zap_cursor_t zc;
zap_attribute_t za;
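
A recurring idiom in the compress/decompress hunks above (and again in dsl_dataset.c below): wrap existing flat buffers in stack-allocated abd_t structures so the ABD-based compression entry points can be called without heap allocation or copying. Sketched in isolation, assuming placeholder names (compress_fn, src_buf, dst_buf, and the length/level variables are not ZFS symbols):

	abd_t sabd, dabd;

	/* Borrow the caller's buffers; no allocation, no data copy. */
	abd_get_from_buf_struct(&sabd, src_buf, s_len);
	abd_get_from_buf_struct(&dabd, dst_buf, d_len);

	c_len = compress_fn(&sabd, &dabd, s_len, d_len, level);

	/* Tear down only the wrappers; the buffers remain valid. */
	abd_free(&dabd);
	abd_free(&sabd);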


@@ -95,6 +95,12 @@ uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
#endif
/*
* Override copies= for dedup state objects. 0 means the traditional behaviour
* (i.e. the default for the containing objset, i.e. 3 for the MOS).
*/
uint_t dmu_ddt_copies = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
{DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
@@ -2272,6 +2278,28 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
case ZFS_REDUNDANT_METADATA_NONE:
break;
}
if (dmu_ddt_copies > 0) {
/*
* If this tunable is set, and this is a write for a
* dedup entry store (zap or log), then we treat it
* something like ZFS_REDUNDANT_METADATA_MOST on a
* regular dataset: this many copies, and one more for
* "higher" indirect blocks. This specific exception is
* necessary because dedup objects are stored in the
* MOS, which always has the highest possible copies.
*/
dmu_object_type_t stype =
dn ? dn->dn_storage_type : DMU_OT_NONE;
if (stype == DMU_OT_NONE)
stype = type;
if (stype == DMU_OT_DDT_ZAP) {
copies = dmu_ddt_copies;
if (level >=
zfs_redundant_metadata_most_ditto_level)
copies++;
}
}
} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
@@ -2824,3 +2852,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
"Limit one prefetch call to this size");
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
"Override copies= for dedup objects");


@@ -1391,7 +1391,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
abd_t *dabd = abd_alloc_linear(
drrw->drr_logical_size, B_FALSE);
err = zio_decompress_data(drrw->drr_compressiontype,
abd, abd_to_buf(dabd), abd_get_size(abd),
abd, dabd, abd_get_size(abd),
abd_get_size(dabd), NULL);
if (err != 0) {
@@ -1407,9 +1407,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
/* Recompress the data */
abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
B_FALSE);
void *buf = abd_to_buf(cabd);
uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
abd, &buf, abd_get_size(abd),
abd, &cabd, abd_get_size(abd),
rwa->os->os_complevel);
abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
/* Swap in newly compressed data into the abd */
@@ -2221,7 +2220,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
err = zio_decompress_data(
drrw->drr_compressiontype,
abd, abd_to_buf(decomp_abd),
abd, decomp_abd,
abd_get_size(abd),
abd_get_size(decomp_abd), NULL);
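
Because removed and added lines are interleaved in the hunks above, the post-change corrective-receive path is easier to read reassembled. This condenses only the new lines already shown, with error handling elided:

	/* Decompress the received block into a linear ABD. */
	abd_t *dabd = abd_alloc_linear(drrw->drr_logical_size, B_FALSE);
	err = zio_decompress_data(drrw->drr_compressiontype,
	    abd, dabd, abd_get_size(abd), abd_get_size(dabd), NULL);

	/* Recompress with the pool's algorithm; zero-pad to psize. */
	abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE);
	uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
	    abd, &cabd, abd_get_size(abd), rwa->os->os_complevel);
	abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);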


@@ -2425,8 +2425,14 @@ get_receive_resume_token_impl(dsl_dataset_t *ds)
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
compressed_size = gzip_compress(packed, compressed,
/* Call compress function directly to avoid hole detection. */
abd_t pabd, cabd;
abd_get_from_buf_struct(&pabd, packed, packed_size);
abd_get_from_buf_struct(&cabd, compressed, packed_size);
compressed_size = zfs_gzip_compress(&pabd, &cabd,
packed_size, packed_size, 6);
abd_free(&cabd);
abd_free(&pabd);
zio_cksum_t cksum;
fletcher_4_native_varsize(compressed, compressed_size, &cksum);


@@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
zap_cursor_fini(&zc);
}
ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
spa_scan_stat_init(spa);
vdev_scan_stat_init(spa->spa_root_vdev);
@@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
spa_history_log_internal(spa, "scan setup", tx,
@@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
txg_sync_waiting(scn->scn_dp) ||
NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
spa_shutting_down(scn->scn_dp->dp_spa) ||
(zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
(zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
!ddt_walk_ready(scn->scn_dp->dp_spa)) {
if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
dprintf("suspending at first available bookmark "
"%llx/%llx/%llx/%llx\n",
@@ -2929,11 +2934,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx)
ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
(void) tx;
const ddt_key_t *ddk = &dde->dde_key;
ddt_phys_t *ddp = dde->dde_phys;
const ddt_key_t *ddk = &ddlwe->ddlwe_key;
blkptr_t bp;
zbookmark_phys_t zb = { 0 };
@@ -2954,11 +2958,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
if (scn->scn_done_txg != 0)
return;
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
continue;
ddt_bp_create(checksum, ddk, ddp, &bp);
ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);
scn->scn_visited_this_txg++;
scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
@@ -3002,11 +3008,11 @@ static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
ddt_entry_t dde = {{{{0}}}};
ddt_lightweight_entry_t ddlwe = {0};
int error;
uint64_t n = 0;
while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) {
ddt_t *ddt;
if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
@@ -3021,16 +3027,28 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
ASSERT(avl_first(&ddt->ddt_tree) == NULL);
dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
n++;
if (dsl_scan_check_suspend(scn, NULL))
break;
}
zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
"suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
if (error == EAGAIN) {
dsl_scan_check_suspend(scn, NULL);
error = 0;
zfs_dbgmsg("waiting for ddt to become ready for scan "
"on %s with class_max = %u; suspending=%u",
scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_suspending);
} else
zfs_dbgmsg("scanned %llu ddt entries on %s with "
"class_max = %u; suspending=%u", (longlong_t)n,
scn->scn_dp->dp_spa->spa_name,
(int)scn->scn_phys.scn_ddt_class_max,
(int)scn->scn_suspending);
ASSERT(error == 0 || error == ENOENT);
ASSERT(error != ENOENT ||
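
New with the FDT log, ddt_walk() can return EAGAIN to signal that the DDT is not yet in a walkable state (ddt_walk_ready() is false while log entries are still being flushed); the scan suspends and retries on a later sync instead of treating this as an error. A paraphrased sketch of the consumer pattern, assuming hypothetical helpers (scan_one_entry() and should_suspend() are stand-ins, not ZFS symbols):

	int error;
	while ((error = ddt_walk(spa, ddb, &ddlwe)) == 0) {
		scan_one_entry(&ddlwe);		/* hypothetical work */
		if (should_suspend())		/* hypothetical check */
			break;
	}
	if (error == EAGAIN) {
		/* Not a failure: DDT not ready yet; retry next txg. */
		error = 0;
	}
	ASSERT(error == 0 || error == ENOENT);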


@@ -47,8 +47,9 @@ typedef uLongf zlen_t;
#endif
size_t
gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static size_t
zfs_gzip_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
int ret;
zlen_t dstlen = d_len;
@@ -82,8 +83,9 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return ((size_t)dstlen);
}
int
gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
static int
zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
(void) n;
zlen_t dstlen = d_len;
@@ -103,3 +105,6 @@ gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return (0);
}
ZFS_COMPRESS_WRAP_DECL(zfs_gzip_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_gzip_decompress)
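
The buffer-based entry points become static (gaining a _buf suffix), and the exported ABD-based functions are generated by ZFS_COMPRESS_WRAP_DECL()/ZFS_DECOMPRESS_WRAP_DECL(). The macro bodies are not part of this diff; as a rough sketch under that caveat, such a wrapper borrows linear views of the ABDs and delegates to the corresponding _buf function:

/* Illustrative only; not the actual ZFS_COMPRESS_WRAP_DECL() definition. */
#define MY_COMPRESS_WRAP_DECL(name)					\
size_t									\
name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level)	\
{									\
	void *s = abd_borrow_buf_copy(src, s_len);			\
	void *d = abd_borrow_buf(dst, d_len);				\
	size_t c_len = name##_buf(s, d, s_len, d_len, level);		\
	abd_return_buf(src, s, s_len);					\
	abd_return_buf_copy(dst, d, d_len);				\
	return (c_len);							\
}

The same pattern covers lz4.c below, where lz4_compress_zfs/lz4_decompress_zfs become static zfs_lz4_compress_buf/zfs_lz4_decompress_buf behind the wrappers.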


@@ -53,8 +53,8 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
static void *lz4_alloc(int flags);
static void lz4_free(void *ctx);
size_t
lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
static size_t
zfs_lz4_compress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
(void) n;
@@ -81,8 +81,8 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
return (bufsiz + sizeof (bufsiz));
}
int
lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
static int
zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len,
size_t d_len, int n)
{
(void) n;
@@ -101,6 +101,9 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
d_start, bufsiz, d_len) < 0);
}
ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_lz4_decompress)
/*
* LZ4 API Description:
*
