diff --git a/cddl/lib/libicp/Makefile b/cddl/lib/libicp/Makefile index b8bd3dc6df95..f097e7e6ff58 100644 --- a/cddl/lib/libicp/Makefile +++ b/cddl/lib/libicp/Makefile @@ -103,6 +103,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h CFLAGS+= -DHAVE_ISSETUGID CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h diff --git a/cddl/lib/libicp_rescue/Makefile b/cddl/lib/libicp_rescue/Makefile index 3b332f736bde..3a8b6746fe61 100644 --- a/cddl/lib/libicp_rescue/Makefile +++ b/cddl/lib/libicp_rescue/Makefile @@ -100,6 +100,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h CFLAGS+= -DHAVE_ISSETUGID -UHAVE_AVX -DRESCUE CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h diff --git a/cddl/lib/libzfs/Makefile b/cddl/lib/libzfs/Makefile index 0b31fcb04b07..bb0127a9108a 100644 --- a/cddl/lib/libzfs/Makefile +++ b/cddl/lib/libzfs/Makefile @@ -63,10 +63,10 @@ KERNEL_C = \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ + zfs_valstr.c \ zpool_prop.c \ zprop_common.c - ARCH_C = .if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" ARCH_C += zfs_fletcher_intel.c \ @@ -92,6 +92,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libshare +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile index 95038f4e5638..b80486484506 100644 --- a/cddl/lib/libzpool/Makefile +++ b/cddl/lib/libzpool/Makefile @@ -1,5 +1,7 @@ ZFSTOP= ${SRCTOP}/sys/contrib/openzfs +.PATH: ${ZFSTOP}/lib/libzpool + # ZFS_COMMON_SRCS .PATH: ${ZFSTOP}/module/zfs .PATH: ${ZFSTOP}/module/zcommon @@ -14,8 +16,6 @@ ZFSTOP= ${SRCTOP}/sys/contrib/openzfs .PATH: ${ZFSTOP}/module/os/linux/zfs -.PATH: ${ZFSTOP}/lib/libzpool - .if exists(${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}/opensolaris_atomic.S) .PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH} ATOMIC_SRCS= opensolaris_atomic.S @@ -34,6 +34,7 @@ PACKAGE= zfs LIB= zpool USER_C = \ + abd_os.c \ kernel.c \ taskq.c \ util.c @@ -51,7 +52,6 @@ KERNEL_C = \ zpool_prop.c \ zprop_common.c \ abd.c \ - abd_os.c \ aggsum.c \ arc.c \ arc_os.c \ @@ -67,6 +67,7 @@ KERNEL_C = \ dbuf.c \ dbuf_stats.c \ ddt.c \ + ddt_log.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ @@ -255,6 +256,7 @@ CFLAGS+= \ -I${ZFSTOP}/include \ -I${ZFSTOP}/lib/libspl/include \ -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \ -I${SRCTOP}/sys \ -I${ZFSTOP}/include/os/freebsd/zfs \ -I${SRCTOP}/cddl/compat/opensolaris/include \ diff --git a/cddl/sbin/zpool/Makefile b/cddl/sbin/zpool/Makefile index aaa2c51016ad..ab7b852b4d9a 100644 
--- a/cddl/sbin/zpool/Makefile +++ b/cddl/sbin/zpool/Makefile @@ -22,6 +22,7 @@ MAN= \ zpool-create.8 \ zpool-destroy.8 \ zpool-detach.8 \ + zpool-ddtprune.8 \ zpool-events.8 \ zpool-export.8 \ zpool-features.7 \ @@ -66,6 +67,7 @@ CFLAGS+= \ -I${ZFSTOP}/include \ -I${ZFSTOP}/lib/libspl/include \ -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \ -I${SRCTOP}/sys \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${ZFSTOP}/cmd/zpool \ diff --git a/cddl/usr.bin/zinject/Makefile b/cddl/usr.bin/zinject/Makefile index 395ba395a84b..921d4df7a651 100644 --- a/cddl/usr.bin/zinject/Makefile +++ b/cddl/usr.bin/zinject/Makefile @@ -15,6 +15,7 @@ CFLAGS+= \ -I${ZFSTOP}/include \ -I${ZFSTOP}/lib/libspl/include \ -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \ -I${SRCTOP}/sys \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${ZFSTOP}/module/icp/include \ diff --git a/cddl/usr.bin/zstream/Makefile b/cddl/usr.bin/zstream/Makefile index 96a46cdf1cf9..e08cadb31a7d 100644 --- a/cddl/usr.bin/zstream/Makefile +++ b/cddl/usr.bin/zstream/Makefile @@ -21,9 +21,11 @@ SYMLINKS= ${BINDIR}/zstream ${BINDIR}/zstreamdump WARNS?= 2 CFLAGS+= \ -DIN_BASE \ + -DZFS_DEBUG \ -I${ZFSTOP}/include \ -I${ZFSTOP}/lib/libspl/include \ -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \ -I${SRCTOP}/sys \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${ZFSTOP}/module/icp/include \ diff --git a/cddl/usr.bin/ztest/Makefile b/cddl/usr.bin/ztest/Makefile index dec4eb9ab398..ef4bd561b41a 100644 --- a/cddl/usr.bin/ztest/Makefile +++ b/cddl/usr.bin/ztest/Makefile @@ -15,6 +15,7 @@ CFLAGS+= \ -I${ZFSTOP}/include \ -I${ZFSTOP}/lib/libspl/include \ -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${ZFSTOP}/module/icp/include \ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ diff --git a/cddl/usr.sbin/zdb/Makefile b/cddl/usr.sbin/zdb/Makefile index 31013a1d8100..7d1e41e1690e 100644 --- a/cddl/usr.sbin/zdb/Makefile +++ b/cddl/usr.sbin/zdb/Makefile @@ -18,6 +18,7 @@ CFLAGS+= \ -I${ZFSTOP}/lib/libspl/include \ -I${ZFSTOP}/lib/libspl/include/os/freebsd \ -I${ZFSTOP}/lib/libspl/include/os/freebsd/spl \ + -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include \ -I${SRCTOP}/sys \ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ -DHAVE_ISSETUGID diff --git a/cddl/usr.sbin/zfsd/Makefile.common b/cddl/usr.sbin/zfsd/Makefile.common index 72bda687c2da..487caf54a0ce 100644 --- a/cddl/usr.sbin/zfsd/Makefile.common +++ b/cddl/usr.sbin/zfsd/Makefile.common @@ -17,6 +17,7 @@ CFLAGS+= -DIN_BASE CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h CFLAGS+= -I${SRCTOP}/cddl/usr.sbin diff --git a/cddl/usr.sbin/zhack/Makefile b/cddl/usr.sbin/zhack/Makefile index a7c08fa818d5..c4fc25dcfb8f 100644 --- a/cddl/usr.sbin/zhack/Makefile +++ b/cddl/usr.sbin/zhack/Makefile @@ -12,6 +12,7 @@ CFLAGS+= -DIN_BASE CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= 
-I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include diff --git a/lib/libbe/Makefile b/lib/libbe/Makefile index 10bcabea7324..b04becc38d74 100644 --- a/lib/libbe/Makefile +++ b/lib/libbe/Makefile @@ -57,6 +57,7 @@ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzfs +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h diff --git a/sys/cddl/boot/zfs/zfssubr.c b/sys/cddl/boot/zfs/zfssubr.c index 14018926897c..8de463e0fb33 100644 --- a/sys/cddl/boot/zfs/zfssubr.c +++ b/sys/cddl/boot/zfs/zfssubr.c @@ -107,7 +107,7 @@ typedef struct zio_checksum_info { #include "skein_zfs.c" #ifdef HAS_ZSTD_ZFS -extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, +extern int zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n); #endif @@ -191,7 +191,7 @@ static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {NULL, zle_decompress, 64, "zle"}, {NULL, lz4_decompress, 0, "lz4"}, #ifdef HAS_ZSTD_ZFS - {NULL, zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"} + {NULL, zfs_zstd_decompress_buf, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"} #endif }; diff --git a/sys/conf/files b/sys/conf/files index 6445f00e2801..220e0e47ec73 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -238,6 +238,7 @@ contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c optional zfs compile- contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zcommon/zfs_namecheck.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zcommon/zfs_prop.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_valstr.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zcommon/zpool_prop.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zcommon/zprop_common.c optional zfs compile-with "${ZFS_C}" @@ -270,6 +271,7 @@ contrib/openzfs/module/zfs/dbuf.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/dataset_kstats.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/ddt.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/ddt_log.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/ddt_stats.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/dmu.c optional zfs compile-with "${ZFS_C}" diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 7aac80c541ba..76ca22cbae00 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.9 +Linux-Maximum: 6.10 Linux-Minimum: 3.10 diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am index 2bd9d039f20e..96040976e53e 100644 --- a/sys/contrib/openzfs/cmd/Makefile.am +++ b/sys/contrib/openzfs/cmd/Makefile.am @@ -24,7 +24,7 @@ zfs_ids_to_path_LDADD = \ libzfs.la -zhack_CPPFLAGS = $(AM_CPPFLAGS) 
$(FORCEDEBUG_CPPFLAGS) +zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) sbin_PROGRAMS += zhack CPPCHECKTARGETS += zhack @@ -39,7 +39,7 @@ zhack_LDADD = \ ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) -ztest_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +ztest_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) sbin_PROGRAMS += ztest CPPCHECKTARGETS += ztest diff --git a/sys/contrib/openzfs/cmd/mount_zfs.c b/sys/contrib/openzfs/cmd/mount_zfs.c index fc9220950647..283074daf717 100644 --- a/sys/contrib/openzfs/cmd/mount_zfs.c +++ b/sys/contrib/openzfs/cmd/mount_zfs.c @@ -269,8 +269,7 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - if (!zfsutil || sloppy || - libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); } @@ -337,7 +336,7 @@ main(int argc, char **argv) dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); if (!fake) { - if (zfsutil && !sloppy && + if (!remount && !sloppy && !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint); if (error) { diff --git a/sys/contrib/openzfs/cmd/raidz_test/Makefile.am b/sys/contrib/openzfs/cmd/raidz_test/Makefile.am index 3b8b60568323..635216d65d73 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/Makefile.am +++ b/sys/contrib/openzfs/cmd/raidz_test/Makefile.am @@ -1,5 +1,5 @@ raidz_test_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) -raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) bin_PROGRAMS += raidz_test CPPCHECKTARGETS += raidz_test diff --git a/sys/contrib/openzfs/cmd/zdb/Makefile.am b/sys/contrib/openzfs/cmd/zdb/Makefile.am index ebdc19128e1a..8a4388bd1884 100644 --- a/sys/contrib/openzfs/cmd/zdb/Makefile.am +++ b/sys/contrib/openzfs/cmd/zdb/Makefile.am @@ -1,4 +1,4 @@ -zdb_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +zdb_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) zdb_CFLAGS = $(AM_CFLAGS) $(LIBCRYPTO_CFLAGS) sbin_PROGRAMS += zdb diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index e96f7b9ab2f7..aa80c46e6079 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -33,7 +33,7 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. 
* Copyright (c) 2023, Rob Norris */ @@ -1914,23 +1914,25 @@ dump_log_spacemaps(spa_t *spa) } static void -dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) +dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, + uint64_t index) { - const ddt_phys_t *ddp = dde->dde_phys; - const ddt_key_t *ddk = &dde->dde_key; - const char *types[4] = { "ditto", "single", "double", "triple" }; + const ddt_key_t *ddk = &ddlwe->ddlwe_key; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) + for (p = 0; p < DDT_NPHYS(ddt); p++) { + const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); - (void) printf("index %llx refcnt %llu %s %s\n", - (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, - types[p], blkbuf); + (void) printf("index %llx refcnt %llu phys %d %s\n", + (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), + p, blkbuf); } } @@ -1956,11 +1958,37 @@ dump_dedup_ratio(const ddt_stat_t *dds) dedup, compress, copies, dedup * compress / copies); } +static void +dump_ddt_log(ddt_t *ddt) +{ + for (int n = 0; n < 2; n++) { + ddt_log_t *ddl = &ddt->ddt_log[n]; + + uint64_t count = avl_numnodes(&ddl->ddl_tree); + if (count == 0) + continue; + + printf(DMU_POOL_DDT_LOG ": %lu log entries\n", + zio_checksum_table[ddt->ddt_checksum].ci_name, n, count); + + if (dump_opt['D'] < 4) + continue; + + ddt_lightweight_entry_t ddlwe; + uint64_t index = 0; + for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) { + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + dump_ddt_entry(ddt, &ddlwe, index++); + } + } +} + static void dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; - ddt_entry_t dde; + ddt_lightweight_entry_t ddlwe; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; @@ -2001,8 +2029,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) (void) printf("%s contents:\n\n", name); - while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) - dump_dde(ddt, &dde, walk); + while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0) + dump_ddt_entry(ddt, &ddlwe, walk); ASSERT3U(error, ==, ENOENT); @@ -2017,7 +2045,7 @@ dump_all_ddts(spa_t *spa) for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - if (!ddt) + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; @@ -2025,6 +2053,7 @@ dump_all_ddts(spa_t *spa) dump_ddt(ddt, type, class); } } + dump_ddt_log(ddt); } ddt_get_dedup_stats(spa, &dds_total); @@ -2043,6 +2072,32 @@ dump_all_ddts(spa_t *spa) } dump_dedup_ratio(&dds_total); + + /* + * Dump a histogram of unique class entry age + */ + if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) { + ddt_age_histo_t histogram; + + (void) printf("DDT walk unique, building age histogram...\n"); + ddt_prune_walk(spa, 0, &histogram); + + /* + * print out histogram for unique entry class birth + */ + if (histogram.dah_entries > 0) { + (void) printf("%5s %9s %4s\n", + "age", "blocks", "amnt"); + (void) printf("%5s %9s %4s\n", + "-----", "---------", "----"); + for 
(int i = 0; i < HIST_BINS; i++) { + (void) printf("%5d %9d %4d%%\n", 1 << i, + (int)histogram.dah_age_histo[i], + (int)((histogram.dah_age_histo[i] * 100) / + histogram.dah_entries)); + } + } + } } static void @@ -3287,9 +3342,45 @@ fuid_table_destroy(void) } } +/* + * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on + * a live pool are normally cleaned up during ddt_sync(). We can't do that (and + * wouldn't want to anyway), but if we don't clean up, the presence of stuff on + * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. + * + * Note that this is not a particularly efficient way to do this, but + * ddt_remove() is the only public method that can do the work we need, and it + * requires the right locks and so on to do the job. This is only ever called + * during zdb shutdown so efficiency is not especially important. + */ +static void +zdb_ddt_cleanup(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + ddt_enter(ddt); + ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; + while (dde) { + next = AVL_NEXT(&ddt->ddt_tree, dde); + dde->dde_io = NULL; + ddt_remove(ddt, dde); + dde = next; + } + ddt_exit(ddt); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } +} + static void zdb_exit(int reason) { + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { @@ -4592,7 +4683,6 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, l2arc_log_blk_phys_t this_lb; uint64_t asize; l2arc_log_blkptr_t lbps[2]; - abd_t *abd; zio_cksum_t cksum; int failed = 0; l2arc_dev_t dev; @@ -4646,20 +4736,25 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { case ZIO_COMPRESS_OFF: break; - default: - abd = abd_alloc_for_io(asize, B_TRUE); + default: { + abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, &this_lb, 0, asize); - if (zio_decompress_data(L2BLK_GET_COMPRESS( - (&lbps[0])->lbp_prop), abd, &this_lb, - asize, sizeof (this_lb), NULL) != 0) { + abd_t dabd; + abd_get_from_buf_struct(&dabd, &this_lb, + sizeof (this_lb)); + int err = zio_decompress_data(L2BLK_GET_COMPRESS( + (&lbps[0])->lbp_prop), abd, &dabd, + asize, sizeof (this_lb), NULL); + abd_free(&dabd); + abd_free(abd); + if (err != 0) { (void) printf("L2ARC block decompression " "failed\n"); - abd_free(abd); goto out; } - abd_free(abd); break; } + } if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); @@ -5633,7 +5728,6 @@ static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { - uint64_t refcnt = 0; int i; ASSERT(type < ZDB_OT_TOTAL); @@ -5641,8 +5735,167 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; + /* + * This flag controls if we will issue a claim for the block while + * counting it, to ensure that all blocks are referenced in space maps. + * We don't issue claims if we're not doing leak tracking, because it's + * expensive if the user isn't interested. We also don't claim the + * second or later occurrences of cloned or dedup'd blocks, because we + * already claimed them the first time. + */ + boolean_t do_claim = !dump_opt['L']; + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + blkptr_t tempbp;
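An aside on the delete-safe AVL walk that zdb_ddt_cleanup() above relies on: a minimal self-contained sketch, where my_ent_t, its key field, and the umem allocation are illustrative assumptions, not taken from the patch.

    #include <sys/avl.h>
    #include <umem.h>

    typedef struct my_ent {
            uint64_t        me_key;
            avl_node_t      me_avl;
    } my_ent_t;

    /*
     * Empty a tree in one pass. AVL_NEXT() must not be applied to a node
     * that has already been unlinked, so the successor is captured before
     * removal, exactly as zdb_ddt_cleanup() does with ddt_tree.
     */
    static void
    my_tree_empty(avl_tree_t *tree)
    {
            my_ent_t *ent = avl_first(tree), *next;

            while (ent != NULL) {
                    next = AVL_NEXT(tree, ent);
                    avl_remove(tree, ent);
                    umem_free(ent, sizeof (my_ent_t));
                    ent = next;
            }
    }

The AVL library also offers avl_destroy_nodes() for bulk teardown (dump_simulated_ddt() later in this patch uses it), but here zdb must go through ddt_remove(), which, as the comment above notes, is the only public interface that keeps the DDT's own bookkeeping consistent.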
+ if (BP_GET_DEDUP(bp)) { + /* + * Dedup'd blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * We use the existing dedup system to track what we've seen. + * The first time we see a block, we do a ddt_lookup() to see + * if it exists in the DDT. If we're doing leak tracking, we + * claim the block at this time. + * + * Each time we see a block, we reduce the refcount in the + * entry by one, and add to the size and count of dedup'd + * blocks to report at the end. + */ + + ddt_t *ddt = ddt_select(zcb->zcb_spa, bp); + + ddt_enter(ddt); + + /* + * Find the block. This will create the entry in memory, but + * we'll know if that happened by its refcount. + */ + ddt_entry_t *dde = ddt_lookup(ddt, bp); + + /* + * ddt_lookup() can return NULL if this block didn't exist + * in the DDT and creating it would take the DDT over its + * quota. Since we got the block from disk, it must exist in + * the DDT, so this can't happen. However, when unique entries + * are pruned, the dedup bit can be set with no corresponding + * entry in the DDT. + */ + if (dde == NULL) { + ddt_exit(ddt); + goto skipped; + } + + /* Get the phys for this variant */ + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + + /* + * This entry may have multiple sets of DVAs. We must claim + * each set the first time we see them in a real block on disk, + * or count them on subsequent occurrences. We don't have a + * convenient way to track the first time we see each variant, + * so we repurpose dde_io as a set of "seen" flag bits. We can + * do this safely in zdb because it never writes, so it will + * never have a writing zio for this block in that pointer. + */ + boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v)); + if (!seen) + dde->dde_io = + (void *)(((uintptr_t)dde->dde_io) | (1 << v)); + + /* Consume a reference for this block. */ + if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0) + ddt_phys_decref(dde->dde_phys, v); + + /* + * If this entry has a single flat phys, it may have been + * extended with additional DVAs at some time in its life. + * This block might be from before it was fully extended, and + * so have fewer DVAs. + * + * If this is the first time we've seen this block, and we + * claimed it as-is, then we would miss the claim on some + * number of DVAs, which would then be seen as leaked. + * + * In all cases, if we've had fewer DVAs, then the asize would + * be too small, and would lead to the pool apparently using + * more space than allocated. + * + * To handle this, we copy the canonical set of DVAs from the + * entry back to the block pointer before we claim it. + */ + if (v == DDT_PHYS_FLAT) { + ASSERT3U(BP_GET_BIRTH(bp), ==, + ddt_phys_birth(dde->dde_phys, v)); + tempbp = *bp; + ddt_bp_fill(dde->dde_phys, v, &tempbp, + BP_GET_BIRTH(bp)); + bp = &tempbp; + } + + if (seen) { + /* + * The second or later time we see this block, + * it's a duplicate and we count it. + */ + zcb->zcb_dedup_asize += BP_GET_ASIZE(bp); + zcb->zcb_dedup_blocks++; + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + + ddt_exit(ddt); + } else if (zcb->zcb_brt_is_active && + brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if it's there, we note it and its refcount, then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. + */
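A small sketch of the pointer-as-bitfield trick used for the "seen" flags above. seen_test()/seen_mark() are hypothetical helpers, not from the patch; the pattern is only safe when the pointer field is guaranteed unused, which holds here because zdb never issues writes and so dde_io never carries a real zio pointer.

    #include <stdint.h>

    /* Test whether flag `bit` is set in the pointer-sized slot. */
    static inline int
    seen_test(void **slot, int bit)
    {
            return ((((uintptr_t)*slot) & ((uintptr_t)1 << bit)) != 0);
    }

    /* Set flag `bit` in the pointer-sized slot. */
    static inline void
    seen_mark(void **slot, int bit)
    {
            *slot = (void *)(((uintptr_t)*slot) | ((uintptr_t)1 << bit));
    }

Note that zdb_ddt_cleanup() above resets the field (dde->dde_io = NULL) before handing entries to ddt_remove(), so the borrowed bits never escape zdb.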
+ zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre == NULL) { + /* Not seen before; track it */ + uint64_t refcnt = + brt_entry_get_refcount(zcb->zcb_spa, bp); + if (refcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = refcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } else { + /* + * Second or later occurrence, count it and take a + * refcount. + */ + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + } + +skipped: for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; @@ -5745,71 +5998,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); - if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { - /* - * Cloned blocks are special. We need to count them, so we can - * later uncount them when reporting leaked space, and we must - * only claim them them once. - * - * To do this, we keep our own in-memory BRT. For each block - * we haven't seen before, we look it up in the real BRT and - * if its there, we note it and its refcount then proceed as - * normal. If we see the block again, we count it as a clone - * and then give it no further consideration. - */ - zdb_brt_entry_t zbre_search, *zbre; - avl_index_t where; - - zbre_search.zbre_dva = bp->blk_dva[0]; - zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); - if (zbre != NULL) { - zcb->zcb_clone_asize += BP_GET_ASIZE(bp); - zcb->zcb_clone_blocks++; - - zbre->zbre_refcount--; - if (zbre->zbre_refcount == 0) { - avl_remove(&zcb->zcb_brt, zbre); - umem_free(zbre, sizeof (zdb_brt_entry_t)); - } - return; - } - - uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); - if (crefcnt > 0) { - zbre = umem_zalloc(sizeof (zdb_brt_entry_t), - UMEM_NOFAIL); - zbre->zbre_dva = bp->blk_dva[0]; - zbre->zbre_refcount = crefcnt; - avl_insert(&zcb->zcb_brt, zbre, where); - } - } - - if (dump_opt['L']) + if (!do_claim) return; - if (BP_GET_DEDUP(bp)) { - ddt_t *ddt; - ddt_entry_t *dde; - - ddt = ddt_select(zcb->zcb_spa, bp); - ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_FALSE); - - if (dde == NULL) { - refcnt = 0; - } else { - ddt_phys_t *ddp = ddt_phys_select(dde, bp); - ddt_phys_decref(ddp); - refcnt = ddp->ddp_refcnt; - if (ddt_phys_total_refcnt(dde) == 0) - ddt_remove(ddt, dde); - } - ddt_exit(ddt); - } - - VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, - refcnt ? 
0 : spa_min_claim_txg(zcb->zcb_spa), - bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); + VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, + spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, + ZIO_FLAG_CANFAIL))); } static void @@ -6120,49 +6314,6 @@ zdb_load_obsolete_counts(vdev_t *vd) return (counts); } -static void -zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) -{ - ddt_bookmark_t ddb = {0}; - ddt_entry_t dde; - int error; - int p; - - ASSERT(!dump_opt['L']); - - while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { - blkptr_t blk; - ddt_phys_t *ddp = dde.dde_phys; - - if (ddb.ddb_class == DDT_CLASS_UNIQUE) - return; - - ASSERT(ddt_phys_total_refcnt(&dde) > 1); - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - VERIFY(ddt); - - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(ddb.ddb_checksum, - &dde.dde_key, ddp, &blk); - if (p == DDT_PHYS_DITTO) { - zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); - } else { - zcb->zcb_dedup_asize += - BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); - zcb->zcb_dedup_blocks++; - } - } - - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } - - ASSERT(error == ENOENT); -} - typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; @@ -6546,10 +6697,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - zdb_ddt_leak_init(spa, zcb); - spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t @@ -6814,6 +6961,8 @@ dump_block_stats(spa_t *spa) int e, c, err; bp_embedded_type_t i; + ddt_prefetch_all(spa); + zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { @@ -6938,7 +7087,6 @@ dump_block_stats(spa_t *spa) (u_longlong_t)total_alloc, (dump_opt['L']) ? 
"unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); - leaks = B_TRUE; } if (tzb->zb_count == 0) { @@ -7272,29 +7420,27 @@ dump_simulated_ddt(spa_t *spa) spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { - ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); - dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; - dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; - dds.dds_psize = zdde->zdde_ref_psize / refcnt; - dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1]; - dds.dds_ref_blocks = zdde->zdde_ref_blocks; - dds.dds_ref_lsize = zdde->zdde_ref_lsize; - dds.dds_ref_psize = zdde->zdde_ref_psize; - dds.dds_ref_dsize = zdde->zdde_ref_dsize; + dds->dds_blocks += zdde->zdde_ref_blocks / refcnt; + dds->dds_lsize += zdde->zdde_ref_lsize / refcnt; + dds->dds_psize += zdde->zdde_ref_psize / refcnt; + dds->dds_dsize += zdde->zdde_ref_dsize / refcnt; - ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], - &dds, 0); + dds->dds_ref_blocks += zdde->zdde_ref_blocks; + dds->dds_ref_lsize += zdde->zdde_ref_lsize; + dds->dds_ref_psize += zdde->zdde_ref_psize; + dds->dds_ref_dsize += zdde->zdde_ref_dsize; umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); - ddt_histogram_stat(&dds_total, &ddh_total); + ddt_histogram_total(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); @@ -8022,16 +8168,28 @@ dump_mos_leaks(spa_t *spa) mos_leak_vdev(spa->spa_root_vdev); - for (uint64_t class = 0; class < DDT_CLASSES; class++) { - for (uint64_t type = 0; type < DDT_TYPES; type++) { - for (uint64_t cksum = 0; - cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { - ddt_t *ddt = spa->spa_ddt[cksum]; - if (!ddt) - continue; + for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) + continue; + + /* DDT store objects */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { mos_obj_refd(ddt->ddt_object[type][class]); } } + + /* FDT container */ + if (ddt->ddt_version == DDT_VERSION_FDT) + mos_obj_refd(ddt->ddt_dir_object); + + /* FDT log objects */ + if (ddt->ddt_flags & DDT_FLAG_LOG) { + mos_obj_refd(ddt->ddt_log[0].ddl_object); + mos_obj_refd(ddt->ddt_log[1].ddl_object); + } } if (spa->spa_brt != NULL) { @@ -8499,13 +8657,22 @@ try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize, memset(lbuf, 0x00, lsize); memset(lbuf2, 0xff, lsize); + abd_t labd, labd2; + abd_get_from_buf_struct(&labd, lbuf, lsize); + abd_get_from_buf_struct(&labd2, lbuf2, lsize); + + boolean_t ret = B_FALSE; if (zio_decompress_data(cfunc, pabd, - lbuf, psize, lsize, NULL) == 0 && + &labd, psize, lsize, NULL) == 0 && zio_decompress_data(cfunc, pabd, - lbuf2, psize, lsize, NULL) == 0 && + &labd2, psize, lsize, NULL) == 0 && memcmp(lbuf, lbuf2, lsize) == 0) - return (B_TRUE); - return (B_FALSE); + ret = B_TRUE; + + abd_free(&labd2); + abd_free(&labd); + + return (ret); } static uint64_t @@ -9624,6 +9791,9 @@ main(int argc, char **argv) } fini: + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c index e0ad00800add..e35cd0756c60 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c @@ -844,7 +844,6 
@@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) const char *failmode = NULL; boolean_t checkremove = B_FALSE; uint32_t pri = 0; - int32_t flags = 0; /* * If this is a checksum or I/O error, then toss it into the */ } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { + uint64_t flags = 0; + int32_t flags32 = 0; /* * We ignore ereports for checksum errors generated by * scrub/resilver I/O to avoid potentially further * degrading the pool while it's being repaired. + * + * Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to + * be int32. To allow newer zed to work on older + * kernels, if we don't find the flags, we look for + * the older ones too. */ if (((nvlist_lookup_uint32(nvl, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) && (pri == ZIO_PRIORITY_SCRUB || pri == ZIO_PRIORITY_REBUILD)) || - ((nvlist_lookup_int32(nvl, + ((nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) && - (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) { + (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) || + ((nvlist_lookup_int32(nvl, + FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) && + (flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) { fmd_hdl_debug(hdl, "ignoring '%s' for " "scrub/resilver I/O", class); return; diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 620746f8e7bb..349c208c521b 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -75,6 +75,7 @@ #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" +#include "zfs_valstr.h" #include "statcommon.h" @@ -130,6 +131,8 @@ static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); +static int zpool_do_ddt_prune(int, char **); + static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( @@ -170,6 +173,7 @@ typedef enum { HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, + HELP_DDT_PRUNE, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, @@ -426,6 +430,8 @@ static zpool_command_t command_table[] = { { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, + { NULL }, + { "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) @@ -537,7 +543,7 @@ get_usage(zpool_help_t idx) "\t [-o property=value] <pool> <newpool> " "[<device> ...]\n")); case HELP_REGUID: - return (gettext("\treguid <pool>\n")); + return (gettext("\treguid [-g guid] <pool>\n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: @@ -545,6 +551,8 @@ get_usage(zpool_help_t idx) case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] " "<pool> [interval]\n")); + case HELP_DDT_PRUNE: + return (gettext("\tddtprune -d|-p <amount> <pool>\n")); default: __builtin_unreachable(); } @@ -2025,7 +2033,7 @@ zpool_do_create(int argc, char **argv) char *end; u_longlong_t ver; - ver = strtoull(propval, &end, 10); + ver = strtoull(propval, &end, 0); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_pool_features = B_FALSE; @@ -8232,19 +8240,32 @@ zpool_do_clear(int argc, char **argv) } /* - * zpool reguid <pool> + * zpool reguid [-g <guid>] <pool> */ int zpool_do_reguid(int argc, char **argv) { + uint64_t guid; + uint64_t *guidp = NULL; int c; + char *endptr; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ - while ((c = getopt(argc, argv, "")) != -1) {
"g:")) != -1) { switch (c) { + case 'g': + errno = 0; + guid = strtoull(optarg, &endptr, 10); + if (errno != 0 || *endptr != '\0') { + (void) fprintf(stderr, + gettext("invalid GUID: %s\n"), optarg); + usage(B_FALSE); + } + guidp = &guid; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -8270,7 +8291,7 @@ zpool_do_reguid(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); - ret = zpool_reguid(zhp); + ret = zpool_set_guid(zhp, guidp); zpool_close(zhp); return (ret); @@ -11916,6 +11937,7 @@ static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; + static char flagstr[256]; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { @@ -11975,7 +11997,21 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); - printf(gettext("0x%x"), i32); + if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE) == 0 || + strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE) == 0) { + zfs_valstr_zio_stage(i32, flagstr, + sizeof (flagstr)); + printf(gettext("0x%x [%s]"), i32, flagstr); + } else if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY) == 0) { + zfs_valstr_zio_priority(i32, flagstr, + sizeof (flagstr)); + printf(gettext("0x%x [%s]"), i32, flagstr); + } else { + printf(gettext("0x%x"), i32); + } break; case DATA_TYPE_INT64: @@ -11996,6 +12032,12 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); + } else if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS) == 0) { + zfs_valstr_zio_flag(i64, flagstr, + sizeof (flagstr)); + printf(gettext("0x%llx [%s]"), + (u_longlong_t)i64, flagstr); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } @@ -13329,6 +13371,88 @@ found:; return (error); } +/* + * zpool ddtprune -d|-p + * + * -d Prune entries old and older + * -p Prune amount of entries + * + * Prune single reference entries from DDT to satisfy the amount specified. 
+int +zpool_do_ddt_prune(int argc, char **argv) +{ + zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE; + uint64_t amount = 0; + zpool_handle_t *zhp; + char *endptr; + int c; + + while ((c = getopt(argc, argv, "d:p:")) != -1) { + switch (c) { + case 'd': + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + (void) fprintf(stderr, gettext("-d cannot be " + "combined with -p option\n")); + usage(B_FALSE); + } + errno = 0; + amount = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0' || amount == 0) { + (void) fprintf(stderr, + gettext("invalid days value\n")); + usage(B_FALSE); + } + amount *= 86400; /* convert days to seconds */ + unit = ZPOOL_DDT_PRUNE_AGE; + break; + case 'p': + if (unit == ZPOOL_DDT_PRUNE_AGE) { + (void) fprintf(stderr, gettext("-p cannot be " + "combined with -d option\n")); + usage(B_FALSE); + } + errno = 0; + amount = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0' || + amount == 0 || amount > 100) { + (void) fprintf(stderr, + gettext("invalid percentage value\n")); + usage(B_FALSE); + } + unit = ZPOOL_DDT_PRUNE_PERCENTAGE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (unit == ZPOOL_DDT_PRUNE_NONE) { + (void) fprintf(stderr, + gettext("missing amount option (-d|-p <amount>)\n")); + usage(B_FALSE); + } else if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } else if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + zhp = zpool_open(g_zfs, argv[0]); + if (zhp == NULL) + return (-1); + + int error = zpool_ddt_prune(zhp, unit, amount); + + zpool_close(zhp); + + return (error); +} + static int find_command_idx(const char *command, int *idx) { diff --git a/sys/contrib/openzfs/cmd/zstream/Makefile.am b/sys/contrib/openzfs/cmd/zstream/Makefile.am index 8506b351165e..be3539fe905d 100644 --- a/sys/contrib/openzfs/cmd/zstream/Makefile.am +++ b/sys/contrib/openzfs/cmd/zstream/Makefile.am @@ -1,3 +1,5 @@ +zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) + sbin_PROGRAMS += zstream CPPCHECKTARGETS += zstream diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_decompress.c b/sys/contrib/openzfs/cmd/zstream/zstream_decompress.c index 0cef36c0441f..c64011e3822a 100644 --- a/sys/contrib/openzfs/cmd/zstream/zstream_decompress.c +++ b/sys/contrib/openzfs/cmd/zstream/zstream_decompress.c @@ -22,6 +22,8 @@ /* * Copyright 2022 Axcient. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2024, Klara, Inc. 
*/ #include @@ -257,83 +259,73 @@ zstream_do_decompress(int argc, char *argv[]) ENTRY e = {.key = key}; p = hsearch(e, FIND); - if (p != NULL) { - zio_decompress_func_t *xfunc = NULL; - switch ((enum zio_compress)(intptr_t)p->data) { - case ZIO_COMPRESS_OFF: - xfunc = NULL; - break; - case ZIO_COMPRESS_LZJB: - xfunc = lzjb_decompress; - break; - case ZIO_COMPRESS_GZIP_1: - xfunc = gzip_decompress; - break; - case ZIO_COMPRESS_ZLE: - xfunc = zle_decompress; - break; - case ZIO_COMPRESS_LZ4: - xfunc = lz4_decompress_zfs; - break; - case ZIO_COMPRESS_ZSTD: - xfunc = zfs_zstd_decompress; - break; - default: - assert(B_FALSE); - } - - - /* - * Read and decompress the block - */ - char *lzbuf = safe_calloc(payload_size); - (void) sfread(lzbuf, payload_size, stdin); - if (xfunc == NULL) { - memcpy(buf, lzbuf, payload_size); - drrw->drr_compressiontype = - ZIO_COMPRESS_OFF; - if (verbose) - fprintf(stderr, "Resetting " - "compression type to off " - "for ino %llu offset " - "%llu\n", - (u_longlong_t) - drrw->drr_object, - (u_longlong_t) - drrw->drr_offset); - } else if (0 != xfunc(lzbuf, buf, - payload_size, payload_size, 0)) { - /* - * The block must not be compressed, - * at least not with this compression - * type, possibly because it gets - * written multiple times in this - * stream. - */ - warnx("decompression failed for " - "ino %llu offset %llu", - (u_longlong_t)drrw->drr_object, - (u_longlong_t)drrw->drr_offset); - memcpy(buf, lzbuf, payload_size); - } else if (verbose) { - drrw->drr_compressiontype = - ZIO_COMPRESS_OFF; - fprintf(stderr, "successfully " - "decompressed ino %llu " - "offset %llu\n", - (u_longlong_t)drrw->drr_object, - (u_longlong_t)drrw->drr_offset); - } else { - drrw->drr_compressiontype = - ZIO_COMPRESS_OFF; - } - free(lzbuf); - } else { + if (p == NULL) { /* * Read the contents of the block unaltered */ (void) sfread(buf, payload_size, stdin); + break; } + + /* + * Read and decompress the block + */ + enum zio_compress c = + (enum zio_compress)(intptr_t)p->data; + + if (c == ZIO_COMPRESS_OFF) { + (void) sfread(buf, payload_size, stdin); + drrw->drr_compressiontype = 0; + drrw->drr_compressed_size = 0; + if (verbose) + fprintf(stderr, + "Resetting compression type to " + "off for ino %llu offset %llu\n", + (u_longlong_t)drrw->drr_object, + (u_longlong_t)drrw->drr_offset); + break; + } + + uint64_t lsize = drrw->drr_logical_size; + ASSERT3U(payload_size, <=, lsize); + + char *lzbuf = safe_calloc(payload_size); + (void) sfread(lzbuf, payload_size, stdin); + + abd_t sabd, dabd; + abd_get_from_buf_struct(&sabd, lzbuf, payload_size); + abd_get_from_buf_struct(&dabd, buf, lsize); + int err = zio_decompress_data(c, &sabd, &dabd, + payload_size, lsize, NULL); + abd_free(&dabd); + abd_free(&sabd); + + if (err == 0) { + drrw->drr_compressiontype = 0; + drrw->drr_compressed_size = 0; + payload_size = lsize; + if (verbose) { + fprintf(stderr, + "successfully decompressed " + "ino %llu offset %llu\n", + (u_longlong_t)drrw->drr_object, + (u_longlong_t)drrw->drr_offset); + } + } else { + /* + * The block must not be compressed, at least + * not with this compression type, possibly + * because it gets written multiple times in + * this stream. 
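The heart of the zstream_decompress.c rework above, pulled out of the hunk for clarity: zio_decompress_data() now takes ABDs rather than raw buffers, so the stream buffers are wrapped in stack-allocated abd_t shells that cost no heap allocation. Variable names (lzbuf, buf, payload_size, lsize, c) follow the surrounding code; this is a condensed restatement, not new behaviour.

    abd_t sabd, dabd;
    abd_get_from_buf_struct(&sabd, lzbuf, payload_size); /* compressed source */
    abd_get_from_buf_struct(&dabd, buf, lsize);          /* logical destination */
    int err = zio_decompress_data(c, &sabd, &dabd,
        payload_size, lsize, NULL);
    abd_free(&dabd);    /* releases only the wrappers, not lzbuf/buf */
    abd_free(&sabd);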
+ */ + warnx("decompression failed for " + "ino %llu offset %llu", + (u_longlong_t)drrw->drr_object, + (u_longlong_t)drrw->drr_offset); + memcpy(buf, lzbuf, payload_size); + } + + free(lzbuf); break; } diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_recompress.c b/sys/contrib/openzfs/cmd/zstream/zstream_recompress.c index f9e01d1aa4c4..ae2c56320b2a 100644 --- a/sys/contrib/openzfs/cmd/zstream/zstream_recompress.c +++ b/sys/contrib/openzfs/cmd/zstream/zstream_recompress.c @@ -22,10 +22,9 @@ /* * Copyright 2022 Axcient. All rights reserved. * Use is subject to license terms. - */ - -/* + * * Copyright (c) 2022 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. */ #include @@ -72,7 +71,7 @@ zstream_do_recompress(int argc, char *argv[]) dmu_replay_record_t *drr = &thedrr; zio_cksum_t stream_cksum; int c; - int level = -1; + int level = 0; while ((c = getopt(argc, argv, "l:")) != -1) { switch (c) { @@ -97,34 +96,22 @@ zstream_do_recompress(int argc, char *argv[]) if (argc != 1) zstream_usage(); - int type = 0; - zio_compress_info_t *cinfo = NULL; - if (0 == strcmp(argv[0], "off")) { - type = ZIO_COMPRESS_OFF; - cinfo = &zio_compress_table[type]; - } else if (0 == strcmp(argv[0], "inherit") || - 0 == strcmp(argv[0], "empty") || - 0 == strcmp(argv[0], "on")) { - // Fall through to invalid compression type case - } else { - for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) { - if (0 == strcmp(zio_compress_table[i].ci_name, - argv[0])) { - cinfo = &zio_compress_table[i]; - type = i; - break; - } - } - } - if (cinfo == NULL) { - fprintf(stderr, "Invalid compression type %s.\n", - argv[0]); - exit(2); - } - if (cinfo->ci_compress == NULL) { - type = 0; - cinfo = &zio_compress_table[0]; + enum zio_compress ctype; + if (strcmp(argv[0], "off") == 0) { + ctype = ZIO_COMPRESS_OFF; + } else { + for (ctype = 0; ctype < ZIO_COMPRESS_FUNCTIONS; ctype++) { + if (strcmp(argv[0], + zio_compress_table[ctype].ci_name) == 0) + break; + } + if (ctype == ZIO_COMPRESS_FUNCTIONS || + zio_compress_table[ctype].ci_compress == NULL) { + fprintf(stderr, "Invalid compression type %s.\n", + argv[0]); + exit(2); + } } if (isatty(STDIN_FILENO)) { @@ -135,6 +122,7 @@ zstream_do_recompress(int argc, char *argv[]) exit(1); } + abd_init(); fletcher_4_init(); zio_init(); zstd_init(); @@ -247,63 +235,78 @@ zstream_do_recompress(int argc, char *argv[]) (void) sfread(buf, payload_size, stdin); break; } - if (drrw->drr_compressiontype >= - ZIO_COMPRESS_FUNCTIONS) { + enum zio_compress dtype = drrw->drr_compressiontype; + if (dtype >= ZIO_COMPRESS_FUNCTIONS) { fprintf(stderr, "Invalid compression type in " - "stream: %d\n", drrw->drr_compressiontype); + "stream: %d\n", dtype); exit(3); } - zio_compress_info_t *dinfo = - &zio_compress_table[drrw->drr_compressiontype]; + if (zio_compress_table[dtype].ci_decompress == NULL) + dtype = ZIO_COMPRESS_OFF; /* Set up buffers to minimize memcpys */ char *cbuf, *dbuf; - if (cinfo->ci_compress == NULL) + if (ctype == ZIO_COMPRESS_OFF) dbuf = buf; else dbuf = safe_calloc(bufsz); - if (dinfo->ci_decompress == NULL) + if (dtype == ZIO_COMPRESS_OFF) cbuf = dbuf; else cbuf = safe_calloc(payload_size); /* Read and decompress the payload */ (void) sfread(cbuf, payload_size, stdin); - if (dinfo->ci_decompress != NULL) { - if (0 != dinfo->ci_decompress(cbuf, dbuf, - payload_size, MIN(bufsz, - drrw->drr_logical_size), dinfo->ci_level)) { + if (dtype != ZIO_COMPRESS_OFF) { + abd_t cabd, dabd; + abd_get_from_buf_struct(&cabd, + cbuf, payload_size); + abd_get_from_buf_struct(&dabd, dbuf, + 
MIN(bufsz, drrw->drr_logical_size)); + if (zio_decompress_data(dtype, &cabd, &dabd, + payload_size, abd_get_size(&dabd), + NULL) != 0) { warnx("decompression type %d failed " "for ino %llu offset %llu", - type, + dtype, (u_longlong_t)drrw->drr_object, (u_longlong_t)drrw->drr_offset); exit(4); } payload_size = drrw->drr_logical_size; + abd_free(&dabd); + abd_free(&cabd); free(cbuf); } /* Recompress the payload */ - if (cinfo->ci_compress != NULL) { - payload_size = P2ROUNDUP(cinfo->ci_compress( - dbuf, buf, drrw->drr_logical_size, - MIN(payload_size, bufsz), (level == -1 ? - cinfo->ci_level : level)), - SPA_MINBLOCKSIZE); - if (payload_size != drrw->drr_logical_size) { - drrw->drr_compressiontype = type; - drrw->drr_compressed_size = - payload_size; - } else { + if (ctype != ZIO_COMPRESS_OFF) { + abd_t dabd, abd; + abd_get_from_buf_struct(&dabd, + dbuf, drrw->drr_logical_size); + abd_t *pabd = + abd_get_from_buf_struct(&abd, buf, bufsz); + size_t csize = zio_compress_data(ctype, &dabd, + &pabd, drrw->drr_logical_size, level); + size_t rounded = + P2ROUNDUP(csize, SPA_MINBLOCKSIZE); + if (rounded >= drrw->drr_logical_size) { memcpy(buf, dbuf, payload_size); drrw->drr_compressiontype = 0; drrw->drr_compressed_size = 0; + } else { + abd_zero_off(pabd, csize, + rounded - csize); + drrw->drr_compressiontype = ctype; + drrw->drr_compressed_size = + payload_size = rounded; } + abd_free(&abd); + abd_free(&dabd); free(dbuf); } else { - drrw->drr_compressiontype = type; + drrw->drr_compressiontype = 0; drrw->drr_compressed_size = 0; } break; @@ -371,6 +374,7 @@ zstream_do_recompress(int argc, char *argv[]) fletcher_4_fini(); zio_fini(); zstd_fini(); + abd_fini(); return (0); } diff --git a/sys/contrib/openzfs/cmd/ztest.c b/sys/contrib/openzfs/cmd/ztest.c index eb68c27b1dc1..ce031632e758 100644 --- a/sys/contrib/openzfs/cmd/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest.c @@ -276,6 +276,8 @@ extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; extern uint64_t raidz_expand_max_reflow_bytes; extern uint_t raidz_expand_pause_point; +extern boolean_t ddt_prune_artificial_age; +extern boolean_t ddt_dump_prune_histogram; static ztest_shared_opts_t *ztest_shared_opts; @@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; ztest_func_t ztest_pool_prefetch_ddt; +ztest_func_t ztest_ddt_prune; static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), + ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -6747,7 +6751,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) load = spa_load_guid(spa); (void) pthread_rwlock_wrlock(&ztest_name_lock); - error = spa_change_guid(spa); + error = spa_change_guid(spa, NULL); zs->zs_guid = spa_guid(spa); (void) pthread_rwlock_unlock(&ztest_name_lock); @@ -7289,6 +7293,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id) mutex_exit(&ztest_vdev_lock); } +void +ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + + spa_t *spa = ztest_spa; + uint64_t pct = ztest_random(15) + 1; + + (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); +} + /* * Verify pool integrity by running zdb. 
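Condensing the recompression decision from the zstream_recompress.c hunk above (same names; the memcpy back to buf in the give-up case is omitted): a compressed payload is kept only if, after rounding up to the SPA_MINBLOCKSIZE allocation unit, it still beats the logical size, and the rounding slack is explicitly zeroed so the emitted stream is deterministic.

    size_t csize = zio_compress_data(ctype, &dabd, &pabd,
        drrw->drr_logical_size, level);
    size_t rounded = P2ROUNDUP(csize, SPA_MINBLOCKSIZE);
    if (rounded >= drrw->drr_logical_size) {
            /* No savings after rounding: ship the block uncompressed. */
            drrw->drr_compressiontype = 0;
            drrw->drr_compressed_size = 0;
    } else {
            /* Zero the tail up to the rounded size. */
            abd_zero_off(pabd, csize, rounded - csize);
            drrw->drr_compressiontype = ctype;
            drrw->drr_compressed_size = rounded;
    }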
*/ @@ -7470,6 +7485,13 @@ ztest_resume_thread(void *arg) { spa_t *spa = arg; + /* + * Synthesize aged DDT entries for ddt prune testing + */ + ddt_prune_artificial_age = B_TRUE; + if (ztest_opts.zo_verbose >= 3) + ddt_dump_prune_histogram = B_TRUE; + while (!ztest_exiting) { if (spa_suspended(spa)) ztest_resume(spa); @@ -8588,6 +8610,12 @@ ztest_init(ztest_shared_t *zs) if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) continue; + /* + * split 50/50 between legacy and fast dedup + */ + if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) + continue; + VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", spa_feature_table[i].fi_uname)); fnvlist_add_uint64(props, buf, 0); diff --git a/sys/contrib/openzfs/config/Rules.am b/sys/contrib/openzfs/config/Rules.am index 00ac890e2303..9c0714c82513 100644 --- a/sys/contrib/openzfs/config/Rules.am +++ b/sys/contrib/openzfs/config/Rules.am @@ -10,7 +10,8 @@ AM_CPPFLAGS = \ -I$(top_srcdir)/include \ -I$(top_srcdir)/module/icp/include \ -I$(top_srcdir)/lib/libspl/include \ - -I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ + -I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ \ + -I$(top_srcdir)/lib/libzpool/include AM_LIBTOOLFLAGS = --silent @@ -70,4 +71,7 @@ KERNEL_CFLAGS = $(FRAME_LARGER_THAN) LIBRARY_CFLAGS = -no-suppress # Forcibly enable asserts/debugging for libzpool &al. -FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG +# Since ZFS_DEBUG can change shared data structures, all libzpool users must +# be compiled with the same flags. +# See https://github.com/openzfs/zfs/issues/16476 +LIBZPOOL_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install index 10083351abb5..d51e4ef003e6 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install @@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8 usr/share/man/man8/zpool-create.8 usr/share/man/man8/zpool-destroy.8 usr/share/man/man8/zpool-detach.8 +usr/share/man/man8/zpool-ddtprune.8 usr/share/man/man8/zpool-events.8 usr/share/man/man8/zpool-export.8 usr/share/man/man8/zpool-get.8 diff --git a/sys/contrib/openzfs/include/Makefile.am b/sys/contrib/openzfs/include/Makefile.am index fa725c2e7a5f..f173064efc99 100644 --- a/sys/contrib/openzfs/include/Makefile.am +++ b/sys/contrib/openzfs/include/Makefile.am @@ -14,6 +14,7 @@ COMMON_H = \ zfs_fletcher.h \ zfs_namecheck.h \ zfs_prop.h \ + zfs_valstr.h \ \ sys/abd.h \ sys/abd_impl.h \ diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h index bf5579f38fda..01d51999f4eb 100644 --- a/sys/contrib/openzfs/include/libzfs.h +++ b/sys/contrib/openzfs/include/libzfs.h @@ -300,10 +300,14 @@ _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zpool_reguid(zpool_handle_t *); +_LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *); _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); +_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t, + uint64_t); + _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); _LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); diff --git a/sys/contrib/openzfs/include/libzfs_core.h b/sys/contrib/openzfs/include/libzfs_core.h index 
206e5e5c2bf6..b1d74fbbc8f5 100644 --- a/sys/contrib/openzfs/include/libzfs_core.h +++ b/sys/contrib/openzfs/include/libzfs_core.h @@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t, + uint64_t); + #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/os/freebsd/Makefile.am b/sys/contrib/openzfs/include/os/freebsd/Makefile.am index 292f79b8ce72..d975c4fe69fa 100644 --- a/sys/contrib/openzfs/include/os/freebsd/Makefile.am +++ b/sys/contrib/openzfs/include/os/freebsd/Makefile.am @@ -77,6 +77,8 @@ noinst_HEADERS = \ %D%/spl/sys/zmod.h \ %D%/spl/sys/zone.h \ \ + %D%/zfs/sys/abd_os.h \ + %D%/zfs/sys/abd_impl_os.h \ %D%/zfs/sys/arc_os.h \ %D%/zfs/sys/freebsd_crypto.h \ %D%/zfs/sys/freebsd_event.h \ diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/abd_impl_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/abd_impl_os.h new file mode 100644 index 000000000000..309e77110d3c --- /dev/null +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/abd_impl_os.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ABD_IMPL_OS_H +#define _ABD_IMPL_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define abd_enter_critical(flags) critical_enter() +#define abd_exit_critical(flags) critical_exit() + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_OS_H */ diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/abd_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/abd_os.h new file mode 100644 index 000000000000..57122ee83e8d --- /dev/null +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/abd_os.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + */ + +#ifndef _ABD_OS_H +#define _ABD_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd_scatter { + uint_t abd_offset; + void *abd_chunks[1]; /* actually variable-length */ +}; + +struct abd_linear { + void *abd_buf; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/sys/contrib/openzfs/include/os/linux/Makefile.am b/sys/contrib/openzfs/include/os/linux/Makefile.am index f31ae50b96af..9100aebb541e 100644 --- a/sys/contrib/openzfs/include/os/linux/Makefile.am +++ b/sys/contrib/openzfs/include/os/linux/Makefile.am @@ -20,6 +20,8 @@ kernel_linux_HEADERS = \ kernel_sysdir = $(kerneldir)/sys kernel_sys_HEADERS = \ + %D%/zfs/sys/abd_os.h \ + %D%/zfs/sys/abd_impl_os.h \ %D%/zfs/sys/policy.h \ %D%/zfs/sys/trace_acl.h \ %D%/zfs/sys/trace_arc.h \ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h index b73dab631e04..8051de36ba82 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h @@ -20,6 +20,10 @@ * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ +/* + * Copyright (c) 2024, Klara Inc. + * Copyright (c) 2024, Syneto + */ #ifndef _SPL_TASKQ_H #define _SPL_TASKQ_H @@ -33,6 +37,9 @@ #include #include #include +#include + +typedef struct kstat_s kstat_t; #define TASKQ_NAMELEN 31 @@ -74,6 +81,32 @@ typedef enum tq_lock_role { typedef unsigned long taskqid_t; typedef void (task_func_t)(void *); +typedef struct taskq_sums { + /* gauges (inc/dec counters, current value) */ + wmsum_t tqs_threads_active; /* threads running a task */ + wmsum_t tqs_threads_idle; /* threads waiting for work */ + wmsum_t tqs_threads_total; /* total threads */ + wmsum_t tqs_tasks_pending; /* tasks waiting to execute */ + wmsum_t tqs_tasks_priority; /* hi-pri tasks waiting */ + wmsum_t tqs_tasks_total; /* total waiting tasks */ + wmsum_t tqs_tasks_delayed; /* tasks deferred to future */ + wmsum_t tqs_entries_free; /* task entries on free list */ + + /* counters (inc only, since taskq creation) */ + wmsum_t tqs_threads_created; /* threads created */ + wmsum_t tqs_threads_destroyed; /* threads destroyed */ + wmsum_t tqs_tasks_dispatched; /* tasks dispatched */ + wmsum_t tqs_tasks_dispatched_delayed; /* tasks delayed to future */ + wmsum_t tqs_tasks_executed_normal; /* normal pri tasks executed */ + wmsum_t tqs_tasks_executed_priority; /* high pri tasks executed */ + wmsum_t tqs_tasks_executed; /* total tasks executed */ + wmsum_t tqs_tasks_delayed_requeued; /* delayed tasks requeued */ + wmsum_t tqs_tasks_cancelled; /* tasks cancelled before run */ + wmsum_t tqs_thread_wakeups; /* total thread wakeups */ + wmsum_t tqs_thread_wakeups_nowork; /* thread woken but no tasks */ + wmsum_t tqs_thread_sleeps; /* total thread sleeps */ +} taskq_sums_t; + typedef struct taskq { spinlock_t tq_lock; /* protects taskq_t */ char *tq_name; /* taskq name */ @@ -105,6 +138,8 @@ typedef struct taskq { struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; unsigned long lastspawnstop; /* when to purge dynamic */ + taskq_sums_t tq_sums; + kstat_t *tq_ksp; } 
taskq_t; typedef struct taskq_ent { @@ -123,6 +158,13 @@ typedef struct taskq_ent { #define TQENT_FLAG_PREALLOC 0x1 #define TQENT_FLAG_CANCEL 0x2 +/* bits 2-3 are which list tqent is on */ +#define TQENT_LIST_NONE 0x0 +#define TQENT_LIST_PENDING 0x4 +#define TQENT_LIST_PRIORITY 0x8 +#define TQENT_LIST_DELAY 0xc +#define TQENT_LIST_MASK 0xc + typedef struct taskq_thread { struct list_head tqt_thread_list; struct list_head tqt_active_list; diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/abd_impl_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/abd_impl_os.h new file mode 100644 index 000000000000..8192522cd229 --- /dev/null +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/abd_impl_os.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ABD_IMPL_OS_H +#define _ABD_IMPL_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define abd_enter_critical(flags) local_irq_save(flags) +#define abd_exit_critical(flags) local_irq_restore(flags) + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_OS_H */ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/abd_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/abd_os.h new file mode 100644 index 000000000000..ce4f5a2bdf9b --- /dev/null +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/abd_os.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. 
+ */ + +#ifndef _ABD_OS_H +#define _ABD_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd_scatter { + uint_t abd_offset; + uint_t abd_nents; + struct scatterlist *abd_sgl; +}; + +struct abd_linear { + void *abd_buf; + struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ +}; + +typedef struct abd abd_t; + +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); + +/* + * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. + */ +unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/sys/contrib/openzfs/include/sys/abd.h b/sys/contrib/openzfs/include/sys/abd.h index 7b7d84b528cd..567b88c0fc01 100644 --- a/sys/contrib/openzfs/include/sys/abd.h +++ b/sys/contrib/openzfs/include/sys/abd.h @@ -30,6 +30,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -44,8 +45,7 @@ typedef enum abd_flags { ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */ ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ - ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */ - ABD_FLAG_ALLOCD = 1 << 9, /* we allocated the abd_t */ + ABD_FLAG_ALLOCD = 1 << 8, /* we allocated the abd_t */ } abd_flags_t; typedef struct abd { @@ -58,19 +58,8 @@ typedef struct abd { #endif kmutex_t abd_mtx; union { - struct abd_scatter { - uint_t abd_offset; -#if defined(__FreeBSD__) && defined(_KERNEL) - void *abd_chunks[1]; /* actually variable-length */ -#else - uint_t abd_nents; - struct scatterlist *abd_sgl; -#endif - } abd_scatter; - struct abd_linear { - void *abd_buf; - struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ - } abd_linear; + struct abd_scatter abd_scatter; + struct abd_linear abd_linear; struct abd_gang { list_t abd_gang_chain; } abd_gang; @@ -79,9 +68,6 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); -#if defined(__linux__) && defined(_KERNEL) -typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); -#endif extern int zfs_abd_scatter_enabled; @@ -107,6 +93,7 @@ abd_t *abd_get_offset_size(abd_t *, size_t, size_t); abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t); abd_t *abd_get_zeros(size_t); abd_t *abd_get_from_buf(void *, size_t); +abd_t *abd_get_from_buf_struct(abd_t *, void *, size_t); void abd_cache_reap_now(void); /* @@ -128,10 +115,6 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); -#if defined(__linux__) && defined(_KERNEL) -int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, - void *); -#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); @@ -225,16 +208,6 @@ abd_get_size(abd_t *abd) void abd_init(void); void abd_fini(void); -/* - * Linux ABD bio functions - * Note: these are only needed to support vdev_classic. See comment in - * vdev_disk.c. 
- */ -#if defined(__linux__) && defined(_KERNEL) -unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); -#endif - #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/sys/abd_impl.h b/sys/contrib/openzfs/include/sys/abd_impl.h index f88ea25e245d..1eb25d94adc5 100644 --- a/sys/contrib/openzfs/include/sys/abd_impl.h +++ b/sys/contrib/openzfs/include/sys/abd_impl.h @@ -28,6 +28,7 @@ #define _ABD_IMPL_H #include +#include #include #ifdef __cplusplus @@ -111,19 +112,6 @@ void abd_iter_page(struct abd_iter *); #define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) #define ABD_GANG(abd) (abd->abd_u.abd_gang) -#if defined(_KERNEL) -#if defined(__FreeBSD__) -#define abd_enter_critical(flags) critical_enter() -#define abd_exit_critical(flags) critical_exit() -#else -#define abd_enter_critical(flags) local_irq_save(flags) -#define abd_exit_critical(flags) local_irq_restore(flags) -#endif -#else /* !_KERNEL */ -#define abd_enter_critical(flags) ((void)0) -#define abd_exit_critical(flags) ((void)0) -#endif - #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/sys/ddt.h b/sys/contrib/openzfs/include/sys/ddt.h index 66d59cebacde..4e5ccd46318e 100644 --- a/sys/contrib/openzfs/include/sys/ddt.h +++ b/sys/contrib/openzfs/include/sys/ddt.h @@ -39,6 +39,13 @@ extern "C" { struct abd; +/* + * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). + */ +#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ +#define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */ +#define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG) + /* * DDT on-disk storage object types. Each one corresponds to specific * implementation, see ddt_ops_t. The value itself is not stored on disk. @@ -120,30 +127,80 @@ typedef struct { * characteristics of the stored block, such as its location on disk (DVAs), * birth txg and ref count. * - * Note that an entry has an array of four ddt_phys_t, one for each number of - * DVAs (copies= property) and another for additional "ditto" copies. Most - * users of ddt_phys_t will handle indexing into or counting the phys they - * want. + * The "traditional" entry has an array of four, one for each number of DVAs + * (copies= property) and another for additional "ditto" copies. Users of the + * traditional struct will specify the variant (index) of the one they want. + * + * The newer "flat" entry has only a single form that is specified using the + * DDT_PHYS_FLAT variant. + * + * Since the value size varies, use one of the size macros when interfacing + * with the ddt zap. */ -typedef struct { - dva_t ddp_dva[SPA_DVAS_PER_BP]; - uint64_t ddp_refcnt; - uint64_t ddp_phys_birth; -} ddt_phys_t; + +#define DDT_PHYS_MAX (4) /* - * Named indexes into the ddt_phys_t array in each entry. + * Note - this can be used in a flexible array and allocated for + * a specific size (ddp_trad or ddp_flat). So be careful not to + * copy using "=" assignment but instead use ddt_phys_copy(). 
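+ *
+ * As an illustration (an editor's sketch, not part of the original patch),
+ * a by-variant copy has to bound the byte count itself, along these lines:
+ *
+ *	if (v == DDT_PHYS_FLAT)
+ *		memcpy(&dst->ddp_flat, &src->ddp_flat,
+ *		    sizeof (dst->ddp_flat));
+ *	else
+ *		memcpy(&dst->ddp_trad[v], &src->ddp_trad[v],
+ *		    sizeof (dst->ddp_trad[v]));
+ *
+ * which is the kind of bounded copy ddt_phys_copy() is assumed to perform.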
+ */ +typedef union { + /* + * Traditional physical payload value for DDT zap (256 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; + } ddp_trad[DDT_PHYS_MAX]; + + /* + * Flat physical payload value for DDT zap (72 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; /* txg based from BP */ + uint64_t ddp_class_start; /* in realtime seconds */ + } ddp_flat; +} ddt_univ_phys_t; + +/* + * This enum denotes which variant of a ddt_univ_phys_t to target. For + * a traditional DDT entry, it represents the indexes into the ddp_trad + * array. Any consumer of a ddt_univ_phys_t needs to know which variant + * is being targeted. * * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, * we maintain the ability to free existing dedup-ditto blocks. */ -enum ddt_phys_type { + +typedef enum { DDT_PHYS_DITTO = 0, DDT_PHYS_SINGLE = 1, DDT_PHYS_DOUBLE = 2, DDT_PHYS_TRIPLE = 3, - DDT_PHYS_TYPES -}; + DDT_PHYS_FLAT = 4, + DDT_PHYS_NONE = 5 +} ddt_phys_variant_t; + +#define DDT_PHYS_VARIANT(ddt, p) \ + (ASSERT((p) < DDT_PHYS_NONE), \ + ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p))) + +#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad) +#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat) + +#define _DDT_PHYS_SWITCH(ddt, flat, trad) \ + (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad)) + +#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \ + DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE) + +#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX) +#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p) +#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0)) /* * A "live" entry, holding changes to an entry made this txg, and other data to @@ -153,17 +210,27 @@ enum ddt_phys_type { /* State flags for dde_flags */ #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ +#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */ + +/* + * Additional data to support entry update or repair. This is fixed size + * because its relatively rarely used. + */ +typedef struct { + /* copy of data after a repair read, to be rewritten */ + abd_t *dde_repair_abd; + + /* original phys contents before update, for error handling */ + ddt_univ_phys_t dde_orig_phys; + + /* in-flight update IOs */ + zio_t *dde_lead_zio[DDT_PHYS_MAX]; +} ddt_entry_io_t; typedef struct { /* key must be first for ddt_key_compare */ - ddt_key_t dde_key; /* ddt_tree key */ - ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */ - - /* in-flight update IOs */ - zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - - /* copy of data after a repair read, to be rewritten */ - struct abd *dde_repair_abd; + ddt_key_t dde_key; /* ddt_tree key */ + avl_node_t dde_node; /* ddt_tree_node */ /* storage type and class the entry was loaded from */ ddt_type_t dde_type; @@ -173,9 +240,35 @@ typedef struct { kcondvar_t dde_cv; /* signaled when load completes */ uint64_t dde_waiters; /* count of waiters on dde_cv */ - avl_node_t dde_node; /* ddt_tree node */ + ddt_entry_io_t *dde_io; /* IO support, when required */ + + ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */ } ddt_entry_t; +/* + * A lightweight entry is for short-lived or transient uses, like iterating or + * inspecting, when you don't care where it came from. 
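+ *
+ * For example (an editor's illustration, not from the original patch), a
+ * caller can snapshot a live entry into a stack-allocated lightweight
+ * entry with the DDT_ENTRY_TO_LIGHTWEIGHT() macro from ddt_impl.h:
+ *
+ *	ddt_lightweight_entry_t ddlwe;
+ *	DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+ *
+ * after which ddlwe carries its own copy of the key, type, class and phys,
+ * independent of the live tree.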
+ */ +typedef struct { + ddt_key_t ddlwe_key; + ddt_type_t ddlwe_type; + ddt_class_t ddlwe_class; + ddt_univ_phys_t ddlwe_phys; +} ddt_lightweight_entry_t; + +/* + * In-core DDT log. A separate struct to make it easier to switch between the + * appending and flushing logs. + */ +typedef struct { + avl_tree_t ddl_tree; /* logged entries */ + uint32_t ddl_flags; /* flags for this log */ + uint64_t ddl_object; /* log object id */ + uint64_t ddl_length; /* on-disk log size */ + uint64_t ddl_first_txg; /* txg log became active */ + ddt_key_t ddl_checkpoint; /* last checkpoint */ +} ddt_log_t; + /* * In-core DDT object. This covers all entries and stats for a the whole pool * for a given checksum type. @@ -184,23 +277,49 @@ typedef struct { kmutex_t ddt_lock; /* protects changes to all fields */ avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ + avl_tree_t ddt_log_tree; /* logged entries */ avl_tree_t ddt_repair_tree; /* entries being repaired */ - enum zio_checksum ddt_checksum; /* checksum algorithm in use */ - spa_t *ddt_spa; /* pool this ddt is on */ - objset_t *ddt_os; /* ddt objset (always MOS) */ + ddt_log_t ddt_log[2]; /* active/flushing logs */ + ddt_log_t *ddt_log_active; /* pointers into ddt_log */ + ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ + + hrtime_t ddt_flush_start; /* log flush start this txg */ + uint32_t ddt_flush_pass; /* log flush pass this txg */ + + int32_t ddt_flush_count; /* entries flushed this txg */ + int32_t ddt_flush_min; /* min rem entries to flush */ + int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ + int32_t ddt_log_flush_rate; /* rolling log flush rate */ + int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ + + uint64_t ddt_flush_force_txg; /* flush hard before this txg */ + + kstat_t *ddt_ksp; /* kstats context */ + + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ + spa_t *ddt_spa; /* pool this ddt is on */ + objset_t *ddt_os; /* ddt objset (always MOS) */ + + uint64_t ddt_dir_object; /* MOS dir holding ddt objects */ + uint64_t ddt_version; /* DDT version */ + uint64_t ddt_flags; /* FDT option flags */ /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; - /* object ids for whole-ddt and per-type/per-class stats */ + /* object ids for stored, logged and per-type/per-class stats */ uint64_t ddt_stat_object; + ddt_object_t ddt_log_stats; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; /* type/class stats by power-2-sized referenced blocks */ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + + /* log stats power-2-sized referenced blocks */ + ddt_histogram_t ddt_log_histogram; } ddt_t; /* @@ -215,20 +334,36 @@ typedef struct { uint64_t ddb_cursor; } ddt_bookmark_t; -extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, - uint64_t txg); +extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg); extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, - const ddt_phys_t *ddp, blkptr_t *bp); + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp); -extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); -extern void ddt_phys_clear(ddt_phys_t *ddp); -extern void ddt_phys_addref(ddt_phys_t *ddp); -extern void ddt_phys_decref(ddt_phys_t *ddp); -extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + 
const blkptr_t *bp); +extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v); +extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, + const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted); + +extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe); +extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); -extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh); extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); + extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); extern uint64_t ddt_get_ddt_dsize(spa_t *spa); extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); @@ -243,7 +378,7 @@ extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); -extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_prefetch_all(spa_t *spa); @@ -251,6 +386,8 @@ extern void ddt_prefetch_all(spa_t *spa); extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp); +extern void ddt_alloc_entry_io(ddt_entry_t *dde); + extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); @@ -260,10 +397,17 @@ extern void ddt_create(spa_t *spa); extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); -extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); + +extern void ddt_walk_init(spa_t *spa, uint64_t txg); +extern boolean_t ddt_walk_ready(spa_t *spa); +extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, + ddt_lightweight_entry_t *ddlwe); extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); +extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount); + #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/sys/ddt_impl.h b/sys/contrib/openzfs/include/sys/ddt_impl.h index 4aaab10c8737..4d3c0cae072e 100644 --- a/sys/contrib/openzfs/include/sys/ddt_impl.h +++ b/sys/contrib/openzfs/include/sys/ddt_impl.h @@ -28,11 +28,132 @@ #define _SYS_DDT_IMPL_H #include +#include #ifdef __cplusplus extern "C" { #endif +/* DDT version numbers */ +#define DDT_VERSION_LEGACY (0) +#define DDT_VERSION_FDT (1) + +/* Dummy version to signal that configure is still necessary */ +#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) + +/* Names of interesting objects in the DDT root dir */ +#define DDT_DIR_VERSION "version" +#define DDT_DIR_FLAGS "flags" + +/* Fill a lightweight 
entry from a live entry. */ +#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (dde)->dde_key; \ + (ddlwe)->ddlwe_type = (dde)->dde_type; \ + (ddlwe)->ddlwe_class = (dde)->dde_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ +} while (0) + +#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (ddle)->ddle_key; \ + (ddlwe)->ddlwe_type = (ddle)->ddle_type; \ + (ddlwe)->ddlwe_class = (ddle)->ddle_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \ +} while (0) + +/* + * An entry on the log tree. These are "frozen", and a record of what's in + * the on-disk log. They can't be used in place, but can be "loaded" back into + * the live tree. + */ +typedef struct { + ddt_key_t ddle_key; /* ddt_log_tree key */ + avl_node_t ddle_node; /* ddt_log_tree node */ + + ddt_type_t ddle_type; /* storage type */ + ddt_class_t ddle_class; /* storage class */ + + /* extra allocation for flat/trad phys */ + ddt_univ_phys_t ddle_phys[]; +} ddt_log_entry_t; + +/* On-disk log record types. */ +typedef enum { + DLR_INVALID = 0, /* end of block marker */ + DLR_ENTRY = 1, /* an entry to add or replace in the log tree */ +} ddt_log_record_type_t; + +/* On-disk log record header. */ +typedef struct { + /* + * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to + * access it. + * + * bits 0-7: record type (ddt_log_record_type_t) + * bits 8-15: length of record header+payload + * bits 16-47: reserved, all zero + * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type) + * otherwise all zero + * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class) + * otherwise all zero + */ + uint64_t dlr_info; + uint8_t dlr_payload[]; +} ddt_log_record_t; + +#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8) +#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v) +#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16) +#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v) +#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8) +#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v) +#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8) +#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v) + +/* Payload for DLR_ENTRY. */ +typedef struct { + ddt_key_t dlre_key; + ddt_univ_phys_t dlre_phys[]; +} ddt_log_record_entry_t; + +/* Log flags (ddl_flags, dlh_flags) */ +#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */ +#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */ + +/* On-disk log header, stored in the bonus buffer. */ +typedef struct { + /* + * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to + * access it. 
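+ * For example (an editor's note, not part of the original patch),
+ * DLH_SET_VERSION(dlh, 1) packs the version into bits 0-7 below, assuming
+ * the usual BF64_SET(field, offset, length, value) bitfield semantics.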
+ * + * bits 0-7: log version + * bits 8-15: log flags + * bits 16-63: reserved, all zero + */ + uint64_t dlh_info; + + uint64_t dlh_length; /* log size in bytes */ + uint64_t dlh_first_txg; /* txg this log went active */ + ddt_key_t dlh_checkpoint; /* last checkpoint */ +} ddt_log_header_t; + +#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8) +#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v) +#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8) +#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v) + +/* DDT log update state */ +typedef struct { + dmu_tx_t *dlu_tx; /* tx the update is being applied to */ + dnode_t *dlu_dn; /* log object dnode */ + dmu_buf_t **dlu_dbp; /* array of block buffer pointers */ + int dlu_ndbp; /* number of block buffer pointers */ + uint16_t dlu_reclen; /* cached length of record */ + uint64_t dlu_block; /* block for next entry */ + uint64_t dlu_offset; /* offset for next entry */ +} ddt_log_update_t; + /* * Ops vector to access a specific DDT object type. */ @@ -42,25 +163,53 @@ typedef struct { boolean_t prehash); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); int (*ddt_op_lookup)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + const ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_contains)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); int (*ddt_op_update)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, + const ddt_key_t *ddk, const void *phys, size_t psize, dmu_tx_t *tx); int (*ddt_op_remove)(objset_t *os, uint64_t object, const ddt_key_t *ddk, dmu_tx_t *tx); int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, - ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); } ddt_ops_t; extern const ddt_ops_t ddt_zap_ops; -extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); +/* Dedup log API */ +extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, + ddt_log_update_t *dlu); +extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde, + ddt_log_update_t *dlu); +extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); + +extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, + ddt_lightweight_entry_t *ddlwe); + +extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe); +extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, + const ddt_key_t *ddk); + +extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + dmu_tx_t *tx); +extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx); + +extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx); + +extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx); + +extern int ddt_log_load(ddt_t *ddt); +extern void ddt_log_alloc(ddt_t *ddt); +extern void ddt_log_free(ddt_t *ddt); + +extern void ddt_log_init(void); +extern void ddt_log_fini(void); /* * These are only exposed so that zdb can access them. Try not to use them @@ -68,22 +217,59 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); * them up. */ +/* + * We use a histogram to convert a percentage request into a + * cutoff value where entries older than the cutoff get pruned. 
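+ *
+ * Worked example (an editor's addition, not from the original patch):
+ * with 1000 unique entries and a request to prune 30%, bins are summed
+ * oldest-first until the running total first reaches 300 entries, and
+ * the age of that bin becomes the cutoff.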
+ * + * The histogram bins represent hours in power-of-two increments. + * 16 bins cover up to four years. + */ +#define HIST_BINS 16 + +typedef struct ddt_age_histo { + uint64_t dah_entries; + uint64_t dah_age_histo[HIST_BINS]; +} ddt_age_histo_t; + +void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram); + +#if defined(_KERNEL) || !defined(ZFS_DEBUG) +#define ddt_dump_age_histogram(histo, cutoff) ((void)0) +#else +static inline void +ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff) +{ + if (histogram->dah_entries == 0) + return; + + (void) printf("DDT prune unique class age, %llu hour cutoff\n", + (u_longlong_t)(gethrestime_sec() - cutoff)/3600); + (void) printf("%5s %9s %4s\n", "age", "blocks", "amnt"); + (void) printf("%5s %9s %4s\n", "-----", "---------", "----"); + for (int i = 0; i < HIST_BINS; i++) { + (void) printf("%5d %9llu %4d%%\n", 1 << i, + (u_longlong_t)histogram->dah_age_histo[i], + (int)((histogram->dah_age_histo[i] * 100) / + histogram->dah_entries)); + } +} +#endif + + /* * Enough room to expand DMU_POOL_DDT format for all possible DDT * checksum/class/type combinations. */ #define DDT_NAMELEN 32 -extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); +extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, + const ddt_univ_phys_t *ddp); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); -extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); - extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, char *name); extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, - uint64_t *walk, ddt_entry_t *dde); + uint64_t *walk, ddt_lightweight_entry_t *ddlwe); extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, uint64_t *count); extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h index 1376cbef763c..928f5f2b4fd4 100644 --- a/sys/contrib/openzfs/include/sys/dmu.h +++ b/sys/contrib/openzfs/include/sys/dmu.h @@ -375,7 +375,9 @@ typedef struct dmu_buf { #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_LOG "DDT-log-%s-%u" #define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" diff --git a/sys/contrib/openzfs/include/sys/dsl_scan.h b/sys/contrib/openzfs/include/sys/dsl_scan.h index f32f59a2bedf..63734dbc176f 100644 --- a/sys/contrib/openzfs/include/sys/dsl_scan.h +++ b/sys/contrib/openzfs/include/sys/dsl_scan.h @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx); + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h index c7e48d1edc0e..fc4f22cd5304 100644 --- a/sys/contrib/openzfs/include/sys/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fs/zfs.h @@ -1422,7 +1422,7 @@
typedef enum { */ typedef enum zfs_ioc { /* - * Core features - 88/128 numbers reserved. + * Core features - 89/128 numbers reserved. */ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, @@ -1519,6 +1519,7 @@ typedef enum zfs_ioc { ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ + ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ /* * Per-platform (Optional) - 8/128 numbers reserved. @@ -1655,6 +1656,12 @@ typedef enum { ZPOOL_PREFETCH_DDT } zpool_prefetch_type_t; +typedef enum { + ZPOOL_DDT_PRUNE_NONE, + ZPOOL_DDT_PRUNE_AGE, /* in seconds */ + ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */ +} zpool_ddt_prune_unit_t; + /* * Bookmark name values. */ @@ -1710,6 +1717,11 @@ typedef enum { #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" +/* + * The following are names used when invoking ZFS_IOC_POOL_REGUID. + */ +#define ZPOOL_REGUID_GUID "guid" + /* * The following are names used when invoking ZFS_IOC_POOL_TRIM. */ @@ -1748,6 +1760,12 @@ typedef enum { */ #define ZPOOL_PREFETCH_TYPE "prefetch_type" +/* + * The following are names used when invoking ZFS_IOC_DDT_PRUNE. + */ +#define DDT_PRUNE_UNIT "ddt_prune_unit" +#define DDT_PRUNE_AMOUNT "ddt_prune_amount" + /* * Flags for ZFS_IOC_VDEV_SET_STATE */ diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index 0fa3149e6c6f..aa66d489ef1a 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -572,7 +572,7 @@ typedef struct blkptr { #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) -#define BP_ZERO(bp) \ +#define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ @@ -580,6 +580,11 @@ typedef struct blkptr { (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ +} + +#define BP_ZERO(bp) \ +{ \ + BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ @@ -1087,7 +1092,7 @@ extern void spa_strfree(char *); extern uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); -extern int spa_change_guid(spa_t *spa); +extern int spa_change_guid(spa_t *spa, const uint64_t *guidp); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 4fc6f22fcb50..7811abbb9ce3 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -412,6 +412,7 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + boolean_t spa_active_ddt_prune; /* ddt prune process active */ struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index 446b64ccd8ab..3a756949a422 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -167,6 +167,9 @@ typedef enum zio_suspend_reason { * This was originally an enum type. However, those are 32-bit and there is no * way to make a 64-bit enum type. 
Since we ran out of bits for flags, we were * forced to upgrade it to a uint64_t. + * + * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * FLAG. */ typedef uint64_t zio_flag_t; /* diff --git a/sys/contrib/openzfs/include/sys/zio_compress.h b/sys/contrib/openzfs/include/sys/zio_compress.h index 691d7b624488..31602039a150 100644 --- a/sys/contrib/openzfs/include/sys/zio_compress.h +++ b/sys/contrib/openzfs/include/sys/zio_compress.h @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2019, Allan Jude - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Use is subject to license terms. * Copyright (c) 2015, 2016 by Delphix. All rights reserved. */ @@ -122,25 +122,15 @@ enum zio_zstd_levels { struct zio_prop; /* Common signature for all zio compress functions. */ -typedef size_t zio_compress_func_t(void *src, void *dst, +typedef size_t zio_compress_func_t(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int); /* Common signature for all zio decompress functions. */ -typedef int zio_decompress_func_t(void *src, void *dst, +typedef int zio_decompress_func_t(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int); /* Common signature for all zio decompress and get level functions. */ -typedef int zio_decompresslevel_func_t(void *src, void *dst, +typedef int zio_decompresslevel_func_t(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *level); -/* Common signature for all zio get-compression-level functions. */ -typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level); - -/* - * Common signature for all zio decompress functions using an ABD as input. - * This is helpful if you have both compressed ARC and scatter ABDs enabled, - * but is not a requirement for all compression algorithms. - */ -typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, - size_t s_len, size_t d_len, int); /* * Information about each compression function. */ @@ -163,34 +153,66 @@ extern void lz4_fini(void); /* * Compression routines. 
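 * For example (an editor's aside, not part of the patch): with the new
 * ABD-based signatures below, zfs_lzjb_compress(src_abd, dst_abd, s_len,
 * d_len, 0) borrows linear buffers from both ABDs and forwards to a
 * buffer-based zfs_lzjb_compress_buf(), via the ZFS_COMPRESS_WRAP_DECL()
 * macro declared later in this header.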
*/ -extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, - int level); +extern size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len, + size_t d_len, int level); +extern int zfs_lzjb_decompress(abd_t *src, abd_t *dst, size_t s_len, + size_t d_len, int level); +extern size_t zfs_gzip_compress(abd_t *src, abd_t *dst, size_t s_len, + size_t d_len, int level); +extern int zfs_gzip_decompress(abd_t *src, abd_t *dst, size_t s_len, + size_t d_len, int level); +extern size_t zfs_zle_compress(abd_t *src, abd_t *dst, size_t s_len, + size_t d_len, int level); +extern int zfs_zle_decompress(abd_t *src, abd_t *dst, size_t s_len, + size_t d_len, int level); +extern size_t zfs_lz4_compress(abd_t *src, abd_t *dst, size_t s_len, + size_t d_len, int level); +extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len, + size_t d_len, int level); /* * Compress and decompress data if necessary. */ -extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst, +extern size_t zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len, uint8_t level); -extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len, uint8_t *level); -extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, +extern int zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *abd, size_t s_len, size_t d_len, uint8_t *level); extern int zio_compress_to_feature(enum zio_compress comp); +#define ZFS_COMPRESS_WRAP_DECL(name) \ +size_t \ +name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \ +{ \ + void *s_buf = abd_borrow_buf_copy(src, s_len); \ + void *d_buf = abd_borrow_buf(dst, d_len); \ + size_t c_len = name##_buf(s_buf, d_buf, s_len, d_len, n); \ + abd_return_buf(src, s_buf, s_len); \ + abd_return_buf_copy(dst, d_buf, d_len); \ + return (c_len); \ +} +#define ZFS_DECOMPRESS_WRAP_DECL(name) \ +int \ +name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \ +{ \ + void *s_buf = abd_borrow_buf_copy(src, s_len); \ + void *d_buf = abd_borrow_buf(dst, d_len); \ + int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \ + abd_return_buf(src, s_buf, s_len); \ + abd_return_buf_copy(dst, d_buf, d_len); \ + return (err); \ +} +#define ZFS_DECOMPRESS_LEVEL_WRAP_DECL(name) \ +int \ +name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *n) \ +{ \ + void *s_buf = abd_borrow_buf_copy(src, s_len); \ + void *d_buf = abd_borrow_buf(dst, d_len); \ + int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \ + abd_return_buf(src, s_buf, s_len); \ + abd_return_buf_copy(dst, d_buf, d_len); \ + return (err); \ +} + #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/sys/zio_impl.h b/sys/contrib/openzfs/include/sys/zio_impl.h index 2b026d48675a..2c846a5d41f6 100644 --- a/sys/contrib/openzfs/include/sys/zio_impl.h +++ 
b/sys/contrib/openzfs/include/sys/zio_impl.h @@ -120,6 +120,9 @@ extern "C" { /* * zio pipeline stage definitions + * + * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * FLAG. */ enum zio_stage { ZIO_STAGE_OPEN = 1 << 0, /* RWFCXT */ diff --git a/sys/contrib/openzfs/include/sys/zio_priority.h b/sys/contrib/openzfs/include/sys/zio_priority.h index 2d8e7fc36bae..bdf5f9b8ff35 100644 --- a/sys/contrib/openzfs/include/sys/zio_priority.h +++ b/sys/contrib/openzfs/include/sys/zio_priority.h @@ -22,6 +22,10 @@ extern "C" { #endif +/* + * NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * VALUE. + */ typedef enum zio_priority { ZIO_PRIORITY_SYNC_READ, ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ diff --git a/sys/contrib/openzfs/include/sys/zstd/zstd.h b/sys/contrib/openzfs/include/sys/zstd/zstd.h index d8c3fa86dce3..6d212b082f9a 100644 --- a/sys/contrib/openzfs/include/sys/zstd/zstd.h +++ b/sys/contrib/openzfs/include/sys/zstd/zstd.h @@ -90,14 +90,12 @@ typedef struct zfs_zstd_meta { int zstd_init(void); void zstd_fini(void); -size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, - size_t d_len, int level); -size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, +size_t zfs_zstd_compress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level); -int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, +int zfs_zstd_decompress_level(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *level); -int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, +int zfs_zstd_decompress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n); void zfs_zstd_cache_reap_now(void); diff --git a/sys/contrib/openzfs/include/zfeature_common.h b/sys/contrib/openzfs/include/zfeature_common.h index 2515ba321759..5733a8187a95 100644 --- a/sys/contrib/openzfs/include/zfeature_common.h +++ b/sys/contrib/openzfs/include/zfeature_common.h @@ -82,6 +82,7 @@ typedef enum spa_feature { SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, + SPA_FEATURE_FAST_DEDUP, SPA_FEATURES } spa_feature_t; diff --git a/sys/contrib/openzfs/include/zfs_valstr.h b/sys/contrib/openzfs/include/zfs_valstr.h new file mode 100644 index 000000000000..77c26ce1ae7d --- /dev/null +++ b/sys/contrib/openzfs/include/zfs_valstr.h @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024, Klara Inc. 
+ */ + +#ifndef _ZFS_VALSTR_H +#define _ZFS_VALSTR_H extern __attribute__((visibility("default"))) + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * These macros create function prototypes for pretty-printing or stringifying + * certain kinds of numeric types. + * + * _ZFS_VALSTR_DECLARE_BITFIELD(name) creates: + * + * size_t zfs_valstr_<name>_bits(uint64_t bits, char *out, size_t outlen); + * expands single char for each set bit, and space for each clear bit + * + * size_t zfs_valstr_<name>_pairs(uint64_t bits, char *out, size_t outlen); + * expands two-char mnemonic for each bit set in `bits`, separated by `|` + * + * size_t zfs_valstr_<name>(uint64_t bits, char *out, size_t outlen); + * expands full name of each bit set in `bits`, separated by spaces + * + * _ZFS_VALSTR_DECLARE_ENUM(name) creates: + * + * size_t zfs_valstr_<name>(int v, char *out, size_t outlen); + * expands full name of enum value + * + * Each _ZFS_VALSTR_DECLARE_xxx needs a corresponding _VALSTR_xxx_IMPL string + * table in zfs_valstr.c. + */ + +#define _ZFS_VALSTR_DECLARE_BITFIELD(name) \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _bits( \ + uint64_t bits, char *out, size_t outlen); \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _pairs( \ + uint64_t bits, char *out, size_t outlen); \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name( \ + uint64_t bits, char *out, size_t outlen); \ + +#define _ZFS_VALSTR_DECLARE_ENUM(name) \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name( \ + int v, char *out, size_t outlen); \ + +_ZFS_VALSTR_DECLARE_BITFIELD(zio_flag) +_ZFS_VALSTR_DECLARE_BITFIELD(zio_stage) + +_ZFS_VALSTR_DECLARE_ENUM(zio_priority) + +#undef _ZFS_VALSTR_DECLARE_BITFIELD +#undef _ZFS_VALSTR_DECLARE_ENUM + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_VALSTR_H */ diff --git a/sys/contrib/openzfs/lib/libzfs/Makefile.am b/sys/contrib/openzfs/lib/libzfs/Makefile.am index 5e74d908de3d..a976faaf9913 100644 --- a/sys/contrib/openzfs/lib/libzfs/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfs/Makefile.am @@ -47,6 +47,7 @@ nodist_libzfs_la_SOURCES = \ module/zcommon/zfs_fletcher_superscalar4.c \ module/zcommon/zfs_namecheck.c \ module/zcommon/zfs_prop.c \ + module/zcommon/zfs_valstr.c \ module/zcommon/zpool_prop.c \ module/zcommon/zprop_common.c diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi index 51c8dc9647ee..51b29643ee0c 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi [machine-generated abixml hunks; the element markup was lost in extraction and is not reproduced here] diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c index 8a043aa0f872..14410b153130 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c @@ -3733,6 +3733,13 @@
zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, (void) zpool_standard_error(hdl, errno, errbuf); } break; + + case ZFS_ERR_ASHIFT_MISMATCH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "The new device cannot have a higher alignment requirement " + "than the top-level vdev.")); + (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); + break; default: (void) zpool_standard_error(hdl, errno, errbuf); } @@ -4303,22 +4310,55 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) /* * Change the GUID for a pool. + * + * Similar to zpool_reguid(), but may take a GUID. + * + * If the guid argument is NULL, then no GUID is passed in the nvlist to the + * ioctl(). */ int -zpool_reguid(zpool_handle_t *zhp) +zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid) { char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; + nvlist_t *nvl = NULL; zfs_cmd_t zc = {"\0"}; + int error = -1; + + if (guid != NULL) { + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (no_memory(hdl)); + + if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) { + nvlist_free(nvl); + return (no_memory(hdl)); + } + + zcmd_write_src_nvlist(hdl, &zc, nvl); + } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) - return (0); + error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc); + if (error) { + return (zpool_standard_error(hdl, errno, errbuf)); + } + if (guid != NULL) { + zcmd_free_nvlists(&zc); + nvlist_free(nvl); + } + return (0); +} - return (zpool_standard_error(hdl, errno, errbuf)); +/* + * Change the GUID for a pool. + */ +int +zpool_reguid(zpool_handle_t *zhp) +{ + return (zpool_set_guid(zhp, NULL)); } /* @@ -5609,3 +5649,31 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname, return (ret); } + +/* + * Prune older entries from the DDT to reclaim space under the quota + */ +int +zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + int error = lzc_ddt_prune(zhp->zpool_name, unit, amount); + if (error != 0) { + libzfs_handle_t *hdl = zhp->zpool_hdl; + char errbuf[ERRBUFLEN]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot prune dedup table on '%s'"), zhp->zpool_name); + + if (error == EALREADY) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "a prune operation is already in progress")); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + } else { + (void) zpool_standard_error(hdl, errno, errbuf); + } + return (-1); + } + + return (0); +} diff --git a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi index 1062a6b52dff..5ee6b8e09d6d 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi +++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi @@ -162,6 +162,7 @@ + @@ -1444,6 +1445,7 @@ + @@ -1484,6 +1486,13 @@ + + + + + + + @@ -3015,6 +3024,12 @@ + + + + + + diff --git a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.c b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.c index ec8b0ff4f61c..d07fca6cebad 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.c +++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.c @@ -1927,3 +1927,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl) { return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); } + +/* + * Prune the specified amount from the pool's dedup table. 
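+ *
+ * Example (an editor's illustration, not part of the patch; "tank" is a
+ * hypothetical pool): prune unique entries not referenced for 30 days,
+ * or alternatively the oldest 20% of them:
+ *
+ *	error = lzc_ddt_prune("tank", ZPOOL_DDT_PRUNE_AGE, 30 * 24 * 3600);
+ *	error = lzc_ddt_prune("tank", ZPOOL_DDT_PRUNE_PERCENTAGE, 20);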
+ */ +int +lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount) +{ + int error; + + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_int32(args, DDT_PRUNE_UNIT, unit); + fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, amount); + + error = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} diff --git a/sys/contrib/openzfs/lib/libzpool/Makefile.am b/sys/contrib/openzfs/lib/libzpool/Makefile.am index 42f3404db5a9..ff30af7d2b9f 100644 --- a/sys/contrib/openzfs/lib/libzpool/Makefile.am +++ b/sys/contrib/openzfs/lib/libzpool/Makefile.am @@ -1,7 +1,9 @@ +include $(srcdir)/%D%/include/Makefile.am + libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS) libzpool_la_CFLAGS += $(ZLIB_CFLAGS) -libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD @@ -9,6 +11,7 @@ lib_LTLIBRARIES += libzpool.la CPPCHECKTARGETS += libzpool.la dist_libzpool_la_SOURCES = \ + %D%/abd_os.c \ %D%/kernel.c \ %D%/taskq.c \ %D%/util.c @@ -39,7 +42,6 @@ nodist_libzpool_la_SOURCES = \ module/lua/lvm.c \ module/lua/lzio.c \ \ - module/os/linux/zfs/abd_os.c \ module/os/linux/zfs/arc_os.c \ module/os/linux/zfs/trace.c \ module/os/linux/zfs/vdev_file.c \ @@ -62,6 +64,7 @@ nodist_libzpool_la_SOURCES = \ module/zcommon/zfs_fletcher_superscalar4.c \ module/zcommon/zfs_namecheck.c \ module/zcommon/zfs_prop.c \ + module/zcommon/zfs_valstr.c \ module/zcommon/zpool_prop.c \ module/zcommon/zprop_common.c \ \ @@ -79,6 +82,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/dbuf.c \ module/zfs/dbuf_stats.c \ module/zfs/ddt.c \ + module/zfs/ddt_log.c \ module/zfs/ddt_stats.c \ module/zfs/ddt_zap.c \ module/zfs/dmu.c \ diff --git a/sys/contrib/openzfs/lib/libzpool/abd_os.c b/sys/contrib/openzfs/lib/libzpool/abd_os.c new file mode 100644 index 000000000000..5a91605b2fe3 --- /dev/null +++ b/sys/contrib/openzfs/lib/libzpool/abd_os.c @@ -0,0 +1,365 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#include +#include +#include +#include +#include +#include + +/* + * We're simulating scatter/gather with 4K allocations, since that's more like + * what a typical kernel does. 
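+ *
+ * For example (an editor's note, not in the patch): a 10 KiB scatter ABD
+ * is backed by P2ROUNDUP(10240, 4096) / 4096 = 3 iovecs, each pointing at
+ * its own 4 KiB allocation; see abd_iovcnt_for_bytes() below.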
+ */ +#define ABD_PAGESIZE (4096) +#define ABD_PAGESHIFT (12) +#define ABD_PAGEMASK (ABD_PAGESIZE-1) + +/* + * See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is + * mostly useful to get a mix of linear and scatter ABDs for testing. + */ +#define ABD_SCATTER_MIN_SIZE (512 * 3) + +abd_t *abd_zero_scatter = NULL; + +static uint_t +abd_iovcnt_for_bytes(size_t size) +{ + /* + * Each iovec points to a 4K page. There's no real reason to do this + * in userspace, but our whole point here is to make it feel a bit + * more like a real paged memory model. + */ + return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE); +} + +abd_t * +abd_alloc_struct_impl(size_t size) +{ + /* + * Zero-sized means it will be used for a linear or gang abd, so just + * allocate the abd itself and return. + */ + if (size == 0) + return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL)); + + /* + * Allocating for a scatter abd, so compute how many ABD_PAGESIZE + * iovecs we will need to hold this size. Append that allocation to the + * end. Note that struct abd_scatter has includes abd_iov[1], so we + * allocate one less iovec than we need. + * + * Note we're not allocating the pages proper, just the iovec pointers. + * That's down in abd_alloc_chunks. We _could_ do it here in a single + * allocation, but it's fiddly and harder to read for no real gain. + */ + uint_t n = abd_iovcnt_for_bytes(size); + abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec), + UMEM_NOFAIL); + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_iovcnt = n; + return (abd); +} + +void +abd_free_struct_impl(abd_t *abd) +{ + /* For scatter, compute the extra amount we need to free */ + uint_t iovcnt = + abd_is_linear(abd) || abd_is_gang(abd) ? + 0 : (ABD_SCATTER(abd).abd_iovcnt - 1); + umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec)); +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + /* + * We've already allocated the iovec array; ensure that the wanted size + * actually matches, otherwise the caller has made a mistake somewhere. + */ + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + ASSERT3U(n, ==, abd_iovcnt_for_bytes(size)); + + /* + * Allocate a ABD_PAGESIZE region for each iovec. 
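+ *
+ * (Editor's note, not part of the patch: for an abd_size of 6000 bytes
+ * this allocates two 4 KiB chunks, and the 8192 - 6000 = 2192 bytes of
+ * slack are what abd_update_scatter_stats() below charges to
+ * ARC_SPACE_ABD_CHUNK_WASTE.)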
+ */ + struct iovec *iov = ABD_SCATTER(abd).abd_iov; + for (int i = 0; i < n; i++) { + iov[i].iov_base = + umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL); + iov[i].iov_len = ABD_PAGESIZE; + } +} + +void +abd_free_chunks(abd_t *abd) +{ + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + struct iovec *iov = ABD_SCATTER(abd).abd_iov; + for (int i = 0; i < n; i++) + umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE); +} + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size < ABD_SCATTER_MIN_SIZE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size; + if (op == ABDSTAT_INCR) { + arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } else { + arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + (void) abd; + (void) op; + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); +} + +void +abd_verify_scatter(abd_t *abd) +{ +#ifdef ZFS_DEBUG + /* + * scatter abds shall have: + * - at least one iovec + * - all iov_base point somewhere + * - all iov_len are ABD_PAGESIZE + * - offset set within the abd pages somewhere + */ + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + ASSERT3U(n, >, 0); + + uint_t len = 0; + for (int i = 0; i < n; i++) { + ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL); + ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE); + len += ABD_PAGESIZE; + } + + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len); +#endif +} + +void +abd_init(void) +{ + /* + * Create the "zero" scatter abd. This is always the size of the + * largest possible block, but only actually has a single allocated + * page, which all iovecs in the abd point to. + */ + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + + void *zero = + umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL); + memset(zero, 0, ABD_PAGESIZE); + + uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE); + struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov; + for (int i = 0; i < n; i++) { + iov[i].iov_base = zero; + iov[i].iov_len = ABD_PAGESIZE; + } +} + +void +abd_fini(void) +{ + umem_free_aligned( + ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE); + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* + * LINEAR_PAGE is specific to the Linux kernel; we never set this + * flag, so this will never be called. + */ + (void) abd; + PANIC("unreachable"); +} + +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc(size, is_metadata)); +} + +abd_t * +abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size) +{ + + /* + * Create a new scatter dabd by borrowing data pages from sabd to cover + * off+size. + * + * sabd is an existing scatter abd with a set of iovecs, each covering + * an ABD_PAGESIZE (4K) allocation. Its "zero" is at abd_offset. + * + * [........][........][........][........] + * ^- sabd_offset + * + * We want to produce a new abd, referencing those allocations at the + * given offset. + * + * [........][........][........][........] + * ^- dabd_offset = sabd_offset + off + * ^- dabd_offset + size + * + * In this example, dabd needs three iovecs. The first iovec is offset + * 0, so the final dabd_offset is masked back into the first iovec.
+ * + * [........][........][........] + * ^- dabd_offset + */ + size_t soff = ABD_SCATTER(sabd).abd_offset + off; + size_t doff = soff & ABD_PAGEMASK; + size_t iovcnt = abd_iovcnt_for_bytes(doff + size); + + /* + * If the passed-in abd has enough allocated iovecs already, reuse it. + * Otherwise, make a new one. The caller will free the original if the + * one it gets back is not the same. + * + * Note that it's ok if we reuse an abd with more iovecs than we need. + * abd_size has the usable amount of data, and the abd does not own the + * pages referenced by the iovecs. At worst, they're holding dangling + * pointers that we'll never use anyway. + */ + if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt) + dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT); + + /* Set offset into first page in view */ + ABD_SCATTER(dabd).abd_offset = doff; + + /* Copy the wanted iovecs from the source to the dest */ + memcpy(&ABD_SCATTER(dabd).abd_iov[0], + &ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT], + iovcnt * sizeof (struct iovec)); + + return (dabd); +} + +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + ASSERT(!abd_is_gang(abd)); + abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); + aiter->iter_abd = abd; +} + +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); +} + +void +abd_iter_map(struct abd_iter *aiter) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + aiter->iter_mapaddr = + ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + aiter->iter_mapsize = + aiter->iter_abd->abd_size - aiter->iter_pos; + return; + } + + /* + * For scatter, we index into the appropriate iovec, and return the + * smaller of the amount requested, or up to the end of the page. + */ + size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset; + + ASSERT3U(poff >> ABD_PAGESHIFT, <=, + ABD_SCATTER(aiter->iter_abd).abd_iovcnt); + struct iovec *iov = &ABD_SCATTER(aiter->iter_abd). 
+ abd_iov[poff >> ABD_PAGESHIFT]; + + aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK), + aiter->iter_abd->abd_size - aiter->iter_pos); + ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE); + + aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK); +} + +void +abd_iter_unmap(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_cache_reap_now(void) +{ +} diff --git a/sys/contrib/openzfs/lib/libzpool/include/Makefile.am b/sys/contrib/openzfs/lib/libzpool/include/Makefile.am new file mode 100644 index 000000000000..2e0c4c5610be --- /dev/null +++ b/sys/contrib/openzfs/lib/libzpool/include/Makefile.am @@ -0,0 +1,4 @@ +libzpooldir = $(includedir)/libzpool +libzpool_HEADERS = \ + %D%/sys/abd_os.h \ + %D%/sys/abd_impl_os.h diff --git a/sys/contrib/openzfs/lib/libzpool/include/sys/abd_impl_os.h b/sys/contrib/openzfs/lib/libzpool/include/sys/abd_impl_os.h new file mode 100644 index 000000000000..3137346f3bb2 --- /dev/null +++ b/sys/contrib/openzfs/lib/libzpool/include/sys/abd_impl_os.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ABD_IMPL_OS_H +#define _ABD_IMPL_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define abd_enter_critical(flags) ((void)0) +#define abd_exit_critical(flags) ((void)0) + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_OS_H */ diff --git a/sys/contrib/openzfs/lib/libzpool/include/sys/abd_os.h b/sys/contrib/openzfs/lib/libzpool/include/sys/abd_os.h new file mode 100644 index 000000000000..8ff6aa1e9e4f --- /dev/null +++ b/sys/contrib/openzfs/lib/libzpool/include/sys/abd_os.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + */ + +#ifndef _ABD_OS_H +#define _ABD_OS_H + +#ifdef __cplusplus extern "C" { +#endif + +struct abd_scatter { + uint_t abd_offset; + uint_t abd_iovcnt; + struct iovec abd_iov[1]; /* actually variable-length */ +}; + +struct abd_linear { + void *abd_buf; +}; + +#ifdef __cplusplus } +#endif + +#endif /* _ABD_OS_H */ diff --git a/sys/contrib/openzfs/man/Makefile.am b/sys/contrib/openzfs/man/Makefile.am index 194bb4721619..fde704933764 100644 --- a/sys/contrib/openzfs/man/Makefile.am +++ b/sys/contrib/openzfs/man/Makefile.am @@ -72,6 +72,7 @@ dist_man_MANS = \ %D%/man8/zpool-create.8 \ %D%/man8/zpool-destroy.8 \ %D%/man8/zpool-detach.8 \ + %D%/man8/zpool-ddtprune.8 \ %D%/man8/zpool-events.8 \ %D%/man8/zpool-export.8 \ %D%/man8/zpool-get.8 \ diff --git a/sys/contrib/openzfs/man/man4/spl.4 b/sys/contrib/openzfs/man/man4/spl.4 index 5cc12764e18c..22832c492db8 100644 --- a/sys/contrib/openzfs/man/man4/spl.4 +++ b/sys/contrib/openzfs/man/man4/spl.4 @@ -175,17 +175,6 @@ Increasing this value will result in a slower thread creation rate which may be preferable for some configurations. . -.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint -The maximum number of tasks per pending list in each taskq shown in -.Pa /proc/spl/taskq{,-all} . -Write -.Sy 0 -to turn off the limit. -The proc file will walk the lists with lock held, -reading it could cause a lock-up if the list grow too large -without limiting the output. -"(truncated)" will be shown if the list is larger than the limit. -. .It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint Minimum idle threads exit interval for dynamic taskqs. Smaller values allow idle threads exit more often and potentially be diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 45b6c338aa9e..20bb95c1aeea 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -77,6 +77,17 @@ the array is dynamically sized based on total system memory. dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. . +.It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint +Controls the number of copies stored for DeDup Table +.Pq DDT +objects. +Reducing the number of copies to 1 from the previous default of 3 +can reduce the write inflation caused by deduplication. +This assumes redundancy for this data is provided by the vdev layer. +If the DDT is damaged, space may be leaked +.Pq not freed +when the DDT cannot report the correct reference count. +. .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Limit the amount we can prefetch with one call to this amount in bytes. This helps to limit the amount of memory that can be used by prefetching. @@ -121,20 +132,26 @@ Controls whether buffers present on special vdevs are eligible for caching into L2ARC. If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. . -.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int +.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC.
This may be desired to avoid wasting space on L2ARC when reading/writing large amounts of data that are not expected to be accessed more than once. .Pp -The default is off, +The default is 0, meaning both MRU and MFU data and metadata are cached. -When turning off this feature, some MRU buffers will still be present -in ARC and eventually cached on L2ARC. +When turning off this feature (setting it to 0), some MRU buffers will +still be present in ARC and eventually cached on L2ARC. .No If Sy l2arc_noprefetch Ns = Ns Sy 0 , some prefetched buffers will be cached to L2ARC, and those might later transition to MRU, in which case the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp +Setting it to 1 means to L2 cache only MFU data and metadata. +.Pp +Setting it to 2 means to L2 cache all metadata (MRU+MFU) but +only MFU data (i.e. MRU data are not cached). This can be the right setting +to cache as much metadata as possible even with high data turnover. +.Pp Regardless of .Sy l2arc_noprefetch , some MFU buffers might be evicted from ARC, @@ -821,6 +838,7 @@ This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. +To reduce OOM risk, this limit is applied for kswapd reclaims only. .Pp The default limit of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages @@ -974,6 +992,88 @@ milliseconds until the operation completes. .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . +.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint +Maximum number of dedup log flush passes (iterations) per transaction. +.Pp +At the start of each transaction, OpenZFS will estimate how many entries it +needs to flush out to keep up with the change rate, taking the amount and time +taken to flush on previous txgs into account (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +It will spread this amount into a number of passes. +At each pass, it will use the amount already flushed and the total time taken +by flushing and by other IO to recompute how much it should do for the remainder +of the txg. +.Pp +Reducing the max number of passes will make flushing more aggressive, flushing +out more entries on each pass. +This can be faster, but also more likely to compete with other IO. +Increasing the max number of passes will put fewer entries onto each pass, +keeping the overhead of dedup changes to a minimum but possibly causing a large +number of changes to be dumped on the last pass, which can blow out the txg +sync time beyond +.Sy zfs_txg_timeout . +. +.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint +Minimum time to spend on dedup log flush per transaction. +.Pp +At least this long will be spent flushing dedup log entries each transaction, +up to +.Sy zfs_txg_timeout . +This time is spent even if doing so would delay the transaction, that is, +even if all other IO completes in less than this time. +. +.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint +Flush at least this many entries per transaction. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction to +keep up with the ingest rate (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +This sets the minimum for that estimate.
+Raising it can force OpenZFS to flush more aggressively, keeping the log small +and so reducing pool import times, but can make it less able to back off if +log flushing would compete with other IO too much. +. +.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint +Number of transactions to use to compute the flow rate. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction by +monitoring the number of entries changed (ingest rate), number of entries +flushed (flush rate) and time spent flushing (flush time rate) and combining +these into an overall "flow rate". +It will use an exponentially weighted moving average over some number of recent +transactions to compute these rates. +This sets the number of transactions to compute these averages over. +Setting it higher can help to smooth out the flow rate in the face of spiky +workloads, but it will take longer for the flow rate to adjust to a sustained +change in the ingest rate. +. +.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint +Maximum number of transactions to accumulate before starting to flush dedup +logs. +.Pp +OpenZFS maintains two dedup logs, one receiving new changes, one flushing. +If there is nothing to flush, it will accumulate changes for no more than this +many transactions before switching the logs and starting to flush entries out. +. +.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 +Max memory to use for dedup logs. +.Pp +OpenZFS will spend no more than this much memory on maintaining the in-memory +dedup log. +Flushing will begin when around half this amount is being spent on logs. +The default value of +.Sy 0 +will cause it to be set by +.Sy zfs_dedup_log_mem_max_percent +instead. +. +.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint +Max memory to use for dedup logs, as a percentage of total memory. +.Pp +If +.Sy zfs_dedup_log_mem_max +is not set, it will be initialised as a percentage of the total memory in the +system. +. .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of diff --git a/sys/contrib/openzfs/man/man7/zpool-features.7 b/sys/contrib/openzfs/man/man7/zpool-features.7 index ea3c68dc6083..ff6e485a4819 100644 --- a/sys/contrib/openzfs/man/man7/zpool-features.7 +++ b/sys/contrib/openzfs/man/man7/zpool-features.7 @@ -17,8 +17,9 @@ .\" Copyright (c) 2019, Klara Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley +.\" Copyright (c) 2023, Klara Inc. .\" -.Dd June 23, 2022 +.Dd February 14, 2024 .Dt ZPOOL-FEATURES 7 .Os . @@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . +.feature com.klarasystems fast_dedup yes +This feature allows more advanced deduplication features to be enabled on new +dedup tables. +.Pp +This feature will be +.Sy active +when the first deduplicated block is written after a new dedup table is created +(i.e. after a new pool creation, or a new checksum used on a dataset with +.Sy dedup +enabled). +It will be returned to the +.Sy enabled +state when all deduplicated blocks using it are freed. +. .feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on.
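For a sense of how the two halves of the prune interface fit together, the lzc_ddt_prune() entry point added earlier in this series pairs with the zpool-ddtprune.8 page added below. A minimal userspace sketch of the library call might look like the following; it is not part of the patch, and it assumes the ZPOOL_DDT_PRUNE_AGE unit constant from zpool_ddt_prune_unit_t in the OpenZFS headers and a pool named tank:

        #include <stdio.h>
        #include <libzfs_core.h>

        int
        main(void)
        {
                /* Open the ZFS control device before issuing ioctls. */
                int err = libzfs_core_init();
                if (err != 0) {
                        fprintf(stderr, "libzfs_core_init: %d\n", err);
                        return (1);
                }

                /* Roughly equivalent to "zpool ddtprune -d 90 tank". */
                err = lzc_ddt_prune("tank", ZPOOL_DDT_PRUNE_AGE, 90);
                if (err != 0)
                        fprintf(stderr, "lzc_ddt_prune: %d\n", err);

                libzfs_core_fini();
                return (err != 0);
        }

The -p form of the command would instead pass the percentage unit of zpool_ddt_prune_unit_t, with the target percentage of unique entries as the amount.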
diff --git a/sys/contrib/openzfs/man/man8/zpool-ddtprune.8 b/sys/contrib/openzfs/man/man8/zpool-ddtprune.8 new file mode 100644 index 000000000000..1ab7d3982c3e --- /dev/null +++ b/sys/contrib/openzfs/man/man8/zpool-ddtprune.8 @@ -0,0 +1,48 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2024, Klara Inc. +.\" +.Dd June 17, 2024 +.Dt ZPOOL-DDTPRUNE 8 +.Os +. +.Sh NAME +.Nm zpool-ddtprune +.Nd Prunes the oldest entries from the single reference dedup table(s) +.Sh SYNOPSIS +.Nm zpool +.Cm ddtprune +.Fl d Ar days | Fl p Ar percentage +.Ar pool +.Sh DESCRIPTION +This command prunes older unique entries from the dedup table. +As a complement to the dedup quota feature, +.Sy ddtprune +allows removal of older non-duplicate entries to make room for +newer duplicate entries. +.Pp +The amount to prune can be based on a target percentage of the unique entries +or based on the age (i.e., every unique entry older than N days). +. +.Sh SEE ALSO +.Xr zdb 8 , +.Xr zpool-status 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-reguid.8 b/sys/contrib/openzfs/man/man8/zpool-reguid.8 index 1fd4ddd9a77d..4fda3f316e3b 100644 --- a/sys/contrib/openzfs/man/man8/zpool-reguid.8 +++ b/sys/contrib/openzfs/man/man8/zpool-reguid.8 @@ -25,8 +25,10 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara Inc. +.\" Copyright (c) 2024, Mateusz Piotrowski .\" -.Dd May 31, 2021 +.Dd June 21, 2023 .Dt ZPOOL-REGUID 8 .Os . @@ -36,6 +38,7 @@ .Sh SYNOPSIS .Nm zpool .Cm reguid +.Op Fl g Ar guid .Ar pool . .Sh DESCRIPTION @@ -43,6 +46,15 @@ Generates a new unique identifier for the pool. You must ensure that all devices in this pool are online and healthy before performing this action. . +.Bl -tag -width Ds +.It Fl g Ar guid +Set the pool GUID to the provided value. +The GUID can be any 64-bit value accepted by +.Xr strtoull 3 +in base 10. +.Nm +will return an error if the provided GUID is already in use. +.El .Sh SEE ALSO .Xr zpool-export 8 , .Xr zpool-import 8 diff --git a/sys/contrib/openzfs/man/man8/zpool.8 b/sys/contrib/openzfs/man/man8/zpool.8 index c55644d9ecea..02a258f66708 100644 --- a/sys/contrib/openzfs/man/man8/zpool.8 +++ b/sys/contrib/openzfs/man/man8/zpool.8 @@ -592,6 +592,7 @@ don't wait. 
.Xr zpool-checkpoint 8 , .Xr zpool-clear 8 , .Xr zpool-create 8 , +.Xr zpool-ddtprune 8 , .Xr zpool-destroy 8 , .Xr zpool-detach 8 , .Xr zpool-events 8 , diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 4f48cb9da0c1..0472a9348c13 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -16,8 +16,8 @@ src = @abs_srcdir@ obj = @abs_builddir@ else zfs_include = $(srctree)/include/zfs -icp_include = $(srctree)/$(src)/icp/include -zstd_include = $(srctree)/$(src)/zstd/include +icp_include = $(src)/icp/include +zstd_include = $(src)/zstd/include ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h endif @@ -240,6 +240,7 @@ ZCOMMON_OBJS := \ zfs_fletcher_superscalar4.o \ zfs_namecheck.o \ zfs_prop.o \ + zfs_valstr.o \ zpool_prop.o \ zprop_common.o @@ -322,6 +323,7 @@ ZFS_OBJS := \ dbuf.o \ dbuf_stats.o \ ddt.o \ + ddt_log.o \ ddt_stats.o \ ddt_zap.o \ dmu.o \ diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd index d9d31564d090..9161204c99d3 100644 --- a/sys/contrib/openzfs/module/Makefile.bsd +++ b/sys/contrib/openzfs/module/Makefile.bsd @@ -233,6 +233,7 @@ SRCS+= cityhash.c \ zfs_fletcher_superscalar.c \ zfs_namecheck.c \ zfs_prop.c \ + zfs_valstr.c \ zpool_prop.c \ zprop_common.c @@ -252,6 +253,7 @@ SRCS+= abd.c \ dbuf.c \ dbuf_stats.c \ ddt.c \ + ddt_log.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ @@ -426,6 +428,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.abd.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.ddt_zap.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c index fb5c46ecf7c2..f24ea3dc7685 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c @@ -95,14 +95,12 @@ struct { */ static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1; -#if defined(_KERNEL) SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN, &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations."); -#endif kmem_cache_t *abd_chunk_cache; static kstat_t *abd_ksp; @@ -250,7 +248,7 @@ abd_alloc_zero_scatter(void) n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS; + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; ABD_SCATTER(abd_zero_scatter).abd_offset = 0; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 30983b13f7d1..c84cb7407a9c 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -124,7 +124,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); -SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); diff --git 
a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c index 986db1518456..6ee0236d289a 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c @@ -868,16 +868,16 @@ spl_init(void) if ((rc = spl_tsd_init())) goto out2; - if ((rc = spl_taskq_init())) + if ((rc = spl_proc_init())) goto out3; - if ((rc = spl_kmem_cache_init())) + if ((rc = spl_kstat_init())) goto out4; - if ((rc = spl_proc_init())) + if ((rc = spl_taskq_init())) goto out5; - if ((rc = spl_kstat_init())) + if ((rc = spl_kmem_cache_init())) goto out6; if ((rc = spl_zlib_init())) @@ -891,13 +891,13 @@ spl_init(void) out8: spl_zlib_fini(); out7: - spl_kstat_fini(); -out6: - spl_proc_fini(); -out5: spl_kmem_cache_fini(); -out4: +out6: spl_taskq_fini(); +out5: + spl_kstat_fini(); +out4: + spl_proc_fini(); out3: spl_tsd_fini(); out2: @@ -913,10 +913,10 @@ spl_fini(void) { spl_zone_fini(); spl_zlib_fini(); - spl_kstat_fini(); - spl_proc_fini(); spl_kmem_cache_fini(); spl_taskq_fini(); + spl_kstat_fini(); + spl_proc_fini(); spl_tsd_fini(); spl_kvmem_fini(); spl_random_fini(); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c index 2c0cdd9febf5..9fefcd03c410 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -63,8 +62,6 @@ static struct ctl_table_header *spl_kstat = NULL; static struct proc_dir_entry *proc_spl = NULL; static struct proc_dir_entry *proc_spl_kmem = NULL; static struct proc_dir_entry *proc_spl_kmem_slab = NULL; -static struct proc_dir_entry *proc_spl_taskq_all = NULL; -static struct proc_dir_entry *proc_spl_taskq = NULL; struct proc_dir_entry *proc_spl_kstat = NULL; #ifdef DEBUG_KMEM @@ -177,195 +174,6 @@ proc_dohostid(CONST_CTL_TABLE *table, int write, return (0); } -static void -taskq_seq_show_headers(struct seq_file *f) -{ - seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", - "taskq", "act", "nthr", "spwn", "maxt", "pri", - "mina", "maxa", "cura", "flags"); -} - -/* indices into the lheads array below */ -#define LHEAD_PEND 0 -#define LHEAD_PRIO 1 -#define LHEAD_DELAY 2 -#define LHEAD_WAIT 3 -#define LHEAD_ACTIVE 4 -#define LHEAD_SIZE 5 - -static unsigned int spl_max_show_tasks = 512; -/* CSTYLED */ -module_param(spl_max_show_tasks, uint, 0644); -MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc"); - -static int -taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) -{ - taskq_t *tq = p; - taskq_thread_t *tqt = NULL; - spl_wait_queue_entry_t *wq; - struct task_struct *tsk; - taskq_ent_t *tqe; - char name[100]; - struct list_head *lheads[LHEAD_SIZE], *lh; - static char *list_names[LHEAD_SIZE] = - {"pend", "prio", "delay", "wait", "active" }; - int i, j, have_lheads = 0; - unsigned long wflags, flags; - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); - - /* get the various lists and check whether they're empty */ - lheads[LHEAD_PEND] = &tq->tq_pend_list; - lheads[LHEAD_PRIO] = &tq->tq_prio_list; - lheads[LHEAD_DELAY] = &tq->tq_delay_list; -#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY - lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head; -#else - lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; -#endif - lheads[LHEAD_ACTIVE] = &tq->tq_active_list; - - for (i = 0; i < 
LHEAD_SIZE; ++i) { - if (list_empty(lheads[i])) - lheads[i] = NULL; - else - ++have_lheads; - } - - /* early return in non-"all" mode if lists are all empty */ - if (!allflag && !have_lheads) { - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - spin_unlock_irqrestore(&tq->tq_lock, flags); - return (0); - } - - /* unlock the waitq quickly */ - if (!lheads[LHEAD_WAIT]) - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - - /* show the base taskq contents */ - snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance); - seq_printf(f, "%-25s ", name); - seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", - tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, - tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, - tq->tq_nalloc, tq->tq_flags); - - /* show the active list */ - if (lheads[LHEAD_ACTIVE]) { - j = 0; - list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { - if (j == 0) - seq_printf(f, "\t%s:", - list_names[LHEAD_ACTIVE]); - else if (j == 2) { - seq_printf(f, "\n\t "); - j = 0; - } - seq_printf(f, " [%d]%pf(%ps)", - tqt->tqt_thread->pid, - tqt->tqt_task->tqent_func, - tqt->tqt_task->tqent_arg); - ++j; - } - seq_printf(f, "\n"); - } - - for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) - if (lheads[i]) { - j = 0; - list_for_each(lh, lheads[i]) { - if (spl_max_show_tasks != 0 && - j >= spl_max_show_tasks) { - seq_printf(f, "\n\t(truncated)"); - break; - } - /* show the wait waitq list */ - if (i == LHEAD_WAIT) { -#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY - wq = list_entry(lh, - spl_wait_queue_entry_t, entry); -#else - wq = list_entry(lh, - spl_wait_queue_entry_t, task_list); -#endif - if (j == 0) - seq_printf(f, "\t%s:", - list_names[i]); - else if (j % 8 == 0) - seq_printf(f, "\n\t "); - - tsk = wq->private; - seq_printf(f, " %d", tsk->pid); - /* pend, prio and delay lists */ - } else { - tqe = list_entry(lh, taskq_ent_t, - tqent_list); - if (j == 0) - seq_printf(f, "\t%s:", - list_names[i]); - else if (j % 2 == 0) - seq_printf(f, "\n\t "); - - seq_printf(f, " %pf(%ps)", - tqe->tqent_func, - tqe->tqent_arg); - } - ++j; - } - seq_printf(f, "\n"); - } - if (lheads[LHEAD_WAIT]) - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - spin_unlock_irqrestore(&tq->tq_lock, flags); - - return (0); -} - -static int -taskq_all_seq_show(struct seq_file *f, void *p) -{ - return (taskq_seq_show_impl(f, p, B_TRUE)); -} - -static int -taskq_seq_show(struct seq_file *f, void *p) -{ - return (taskq_seq_show_impl(f, p, B_FALSE)); -} - -static void * -taskq_seq_start(struct seq_file *f, loff_t *pos) -{ - struct list_head *p; - loff_t n = *pos; - - down_read(&tq_list_sem); - if (!n) - taskq_seq_show_headers(f); - - p = tq_list.next; - while (n--) { - p = p->next; - if (p == &tq_list) - return (NULL); - } - - return (list_entry(p, taskq_t, tq_taskqs)); -} - -static void * -taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) -{ - taskq_t *tq = p; - - ++*pos; - return ((tq->tq_taskqs.next == &tq_list) ? 
- NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); -} - static void slab_seq_show_headers(struct seq_file *f) { @@ -501,66 +309,6 @@ static const kstat_proc_op_t proc_slab_operations = { #endif }; -static void -taskq_seq_stop(struct seq_file *f, void *v) -{ - up_read(&tq_list_sem); -} - -static const struct seq_operations taskq_all_seq_ops = { - .show = taskq_all_seq_show, - .start = taskq_seq_start, - .next = taskq_seq_next, - .stop = taskq_seq_stop, -}; - -static const struct seq_operations taskq_seq_ops = { - .show = taskq_seq_show, - .start = taskq_seq_start, - .next = taskq_seq_next, - .stop = taskq_seq_stop, -}; - -static int -proc_taskq_all_open(struct inode *inode, struct file *filp) -{ - return (seq_open(filp, &taskq_all_seq_ops)); -} - -static int -proc_taskq_open(struct inode *inode, struct file *filp) -{ - return (seq_open(filp, &taskq_seq_ops)); -} - -static const kstat_proc_op_t proc_taskq_all_operations = { -#ifdef HAVE_PROC_OPS_STRUCT - .proc_open = proc_taskq_all_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -#else - .open = proc_taskq_all_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -#endif -}; - -static const kstat_proc_op_t proc_taskq_operations = { -#ifdef HAVE_PROC_OPS_STRUCT - .proc_open = proc_taskq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -#else - .open = proc_taskq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -#endif -}; - static struct ctl_table spl_kmem_table[] = { #ifdef DEBUG_KMEM { @@ -677,8 +425,6 @@ static void spl_proc_cleanup(void) remove_proc_entry("kstat", proc_spl); remove_proc_entry("slab", proc_spl_kmem); remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); remove_proc_entry("spl", NULL); #ifndef HAVE_REGISTER_SYSCTL_TABLE @@ -761,20 +507,6 @@ spl_proc_init(void) goto out; } - proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl, - &proc_taskq_all_operations, NULL); - if (proc_spl_taskq_all == NULL) { - rc = -EUNATCH; - goto out; - } - - proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl, - &proc_taskq_operations, NULL); - if (proc_spl_taskq == NULL) { - rc = -EUNATCH; - goto out; - } - proc_spl_kmem = proc_mkdir("kmem", proc_spl); if (proc_spl_kmem == NULL) { rc = -EUNATCH; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index e7b812c3b5b5..c16bc9bc6409 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -22,16 +22,98 @@ * * Solaris Porting Layer (SPL) Task Queue Implementation. */ +/* + * Copyright (c) 2024, Klara Inc. 
+ * Copyright (c) 2024, Syneto + */ #include #include #include #include #include +#include +#include +#include #ifdef HAVE_CPU_HOTPLUG #include #endif +typedef struct taskq_kstats { + /* static values, for completeness */ + kstat_named_t tqks_threads_max; + kstat_named_t tqks_entry_pool_min; + kstat_named_t tqks_entry_pool_max; + + /* gauges (inc/dec counters, current value) */ + kstat_named_t tqks_threads_active; + kstat_named_t tqks_threads_idle; + kstat_named_t tqks_threads_total; + kstat_named_t tqks_tasks_pending; + kstat_named_t tqks_tasks_priority; + kstat_named_t tqks_tasks_total; + kstat_named_t tqks_tasks_delayed; + kstat_named_t tqks_entries_free; + + /* counters (inc only, since taskq creation) */ + kstat_named_t tqks_threads_created; + kstat_named_t tqks_threads_destroyed; + kstat_named_t tqks_tasks_dispatched; + kstat_named_t tqks_tasks_dispatched_delayed; + kstat_named_t tqks_tasks_executed_normal; + kstat_named_t tqks_tasks_executed_priority; + kstat_named_t tqks_tasks_executed; + kstat_named_t tqks_tasks_delayed_requeued; + kstat_named_t tqks_tasks_cancelled; + kstat_named_t tqks_thread_wakeups; + kstat_named_t tqks_thread_wakeups_nowork; + kstat_named_t tqks_thread_sleeps; +} taskq_kstats_t; + +static taskq_kstats_t taskq_kstats_template = { + { "threads_max", KSTAT_DATA_UINT64 }, + { "entry_pool_min", KSTAT_DATA_UINT64 }, + { "entry_pool_max", KSTAT_DATA_UINT64 }, + { "threads_active", KSTAT_DATA_UINT64 }, + { "threads_idle", KSTAT_DATA_UINT64 }, + { "threads_total", KSTAT_DATA_UINT64 }, + { "tasks_pending", KSTAT_DATA_UINT64 }, + { "tasks_priority", KSTAT_DATA_UINT64 }, + { "tasks_total", KSTAT_DATA_UINT64 }, + { "tasks_delayed", KSTAT_DATA_UINT64 }, + { "entries_free", KSTAT_DATA_UINT64 }, + + { "threads_created", KSTAT_DATA_UINT64 }, + { "threads_destroyed", KSTAT_DATA_UINT64 }, + { "tasks_dispatched", KSTAT_DATA_UINT64 }, + { "tasks_dispatched_delayed", KSTAT_DATA_UINT64 }, + { "tasks_executed_normal", KSTAT_DATA_UINT64 }, + { "tasks_executed_priority", KSTAT_DATA_UINT64 }, + { "tasks_executed", KSTAT_DATA_UINT64 }, + { "tasks_delayed_requeued", KSTAT_DATA_UINT64 }, + { "tasks_cancelled", KSTAT_DATA_UINT64 }, + { "thread_wakeups", KSTAT_DATA_UINT64 }, + { "thread_wakeups_nowork", KSTAT_DATA_UINT64 }, + { "thread_sleeps", KSTAT_DATA_UINT64 }, +}; + +#define TQSTAT_INC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, 1) +#define TQSTAT_DEC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, -1) + +#define _TQSTAT_MOD_LIST(mod, tq, t) do { \ + switch (t->tqent_flags & TQENT_LIST_MASK) { \ + case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\ + case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \ + case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \ + case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \ + } \ +} while (0) +#define TQSTAT_INC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_INC, tq, t) +#define TQSTAT_DEC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t) + +#define TQENT_SET_LIST(t, l) \ + t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l; + static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); @@ -134,6 +216,7 @@ task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) ASSERT(!timer_pending(&t->tqent_timer)); list_del_init(&t->tqent_list); + TQSTAT_DEC(tq, entries_free); return (t); } @@ -204,12 +287,11 @@ task_done(taskq_t *tq, taskq_ent_t *t) { ASSERT(tq); ASSERT(t); + ASSERT(list_empty(&t->tqent_list)); /* Wake tasks blocked 
in taskq_wait_id() */ wake_up_all(&t->tqent_waitq); - list_del_init(&t->tqent_list); - if (tq->tq_nalloc <= tq->tq_minalloc) { t->tqent_id = TASKQID_INVALID; t->tqent_func = NULL; @@ -217,6 +299,7 @@ task_done(taskq_t *tq, taskq_ent_t *t) t->tqent_flags = 0; list_add_tail(&t->tqent_list, &tq->tq_free_list); + TQSTAT_INC(tq, entries_free); } else { task_free(tq, t); } @@ -263,6 +346,8 @@ task_expire_impl(taskq_ent_t *t) spin_unlock_irqrestore(&tq->tq_lock, flags); wake_up(&tq->tq_work_waitq); + + TQSTAT_INC(tq, tasks_delayed_requeued); } static void @@ -534,7 +619,11 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id) t = taskq_find(tq, id); if (t && t != ERR_PTR(-EBUSY)) { list_del_init(&t->tqent_list); + TQSTAT_DEC_LIST(tq, t); + TQSTAT_DEC(tq, tasks_total); + t->tqent_flags |= TQENT_FLAG_CANCEL; + TQSTAT_INC(tq, tasks_cancelled); /* * When canceling the lowest outstanding task id we @@ -604,13 +693,19 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) spin_lock(&t->tqent_lock); /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ - if (flags & TQ_NOQUEUE) + if (flags & TQ_NOQUEUE) { + TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add(&t->tqent_list, &tq->tq_prio_list); /* Queue to the priority list instead of the pending list */ - else if (flags & TQ_FRONT) + } else if (flags & TQ_FRONT) { + TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add_tail(&t->tqent_list, &tq->tq_prio_list); - else + } else { + TQENT_SET_LIST(t, TQENT_LIST_PENDING); list_add_tail(&t->tqent_list, &tq->tq_pend_list); + } + TQSTAT_INC_LIST(tq, t); + TQSTAT_INC(tq, tasks_total); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; @@ -629,6 +724,8 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) wake_up(&tq->tq_work_waitq); + TQSTAT_INC(tq, tasks_dispatched); + /* Spawn additional taskq threads if required. */ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); @@ -662,6 +759,9 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, /* Queue to the delay list for subsequent execution */ list_add_tail(&t->tqent_list, &tq->tq_delay_list); + TQENT_SET_LIST(t, TQENT_LIST_DELAY); + TQSTAT_INC_LIST(tq, t); + TQSTAT_INC(tq, tasks_total); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; @@ -676,6 +776,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, spin_unlock(&t->tqent_lock); + TQSTAT_INC(tq, tasks_dispatched_delayed); + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); @@ -724,10 +826,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, t->tqent_flags |= TQENT_FLAG_PREALLOC; /* Queue to the priority list instead of the pending list */ - if (flags & TQ_FRONT) + if (flags & TQ_FRONT) { + TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add_tail(&t->tqent_list, &tq->tq_prio_list); - else + } else { + TQENT_SET_LIST(t, TQENT_LIST_PENDING); list_add_tail(&t->tqent_list, &tq->tq_pend_list); + } + TQSTAT_INC_LIST(tq, t); + TQSTAT_INC(tq, tasks_total); t->tqent_id = tq->tq_next_id; tq->tq_next_id++; @@ -742,6 +849,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, wake_up(&tq->tq_work_waitq); + TQSTAT_INC(tq, tasks_dispatched); + /* Spawn additional taskq threads if required. 
*/ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); @@ -908,6 +1017,8 @@ taskq_thread(void *args) wake_up(&tq->tq_wait_waitq); set_current_state(TASK_INTERRUPTIBLE); + TQSTAT_INC(tq, threads_total); + while (!kthread_should_stop()) { if (list_empty(&tq->tq_pend_list) && @@ -919,9 +1030,15 @@ taskq_thread(void *args) add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); spin_unlock_irqrestore(&tq->tq_lock, flags); + TQSTAT_INC(tq, thread_sleeps); + TQSTAT_INC(tq, threads_idle); + schedule(); seq_tasks = 0; + TQSTAT_DEC(tq, threads_idle); + TQSTAT_INC(tq, thread_wakeups); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); remove_wait_queue(&tq->tq_work_waitq, &wait); @@ -931,6 +1048,8 @@ taskq_thread(void *args) if ((t = taskq_next_ent(tq)) != NULL) { list_del_init(&t->tqent_list); + TQSTAT_DEC_LIST(tq, t); + TQSTAT_DEC(tq, tasks_total); /* * A TQENT_FLAG_PREALLOC task may be reused or freed @@ -955,6 +1074,7 @@ taskq_thread(void *args) tq->tq_nactive++; spin_unlock_irqrestore(&tq->tq_lock, flags); + TQSTAT_INC(tq, threads_active); DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); /* Perform the requested task */ @@ -962,8 +1082,17 @@ taskq_thread(void *args) DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); + TQSTAT_DEC(tq, threads_active); + if ((t->tqent_flags & TQENT_LIST_MASK) == + TQENT_LIST_PENDING) + TQSTAT_INC(tq, tasks_executed_normal); + else + TQSTAT_INC(tq, tasks_executed_priority); + TQSTAT_INC(tq, tasks_executed); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + tq->tq_nactive--; list_del_init(&tqt->tqt_active_list); tqt->tqt_task = NULL; @@ -989,7 +1118,8 @@ taskq_thread(void *args) tqt->tqt_id = TASKQID_INVALID; tqt->tqt_flags = 0; wake_up_all(&tq->tq_wait_waitq); - } + } else + TQSTAT_INC(tq, thread_wakeups_nowork); set_current_state(TASK_INTERRUPTIBLE); @@ -998,6 +1128,10 @@ taskq_thread(void *args) __set_current_state(TASK_RUNNING); tq->tq_nthreads--; list_del_init(&tqt->tqt_thread_list); + + TQSTAT_DEC(tq, threads_total); + TQSTAT_INC(tq, threads_destroyed); + error: kmem_free(tqt, sizeof (taskq_thread_t)); spin_unlock_irqrestore(&tq->tq_lock, flags); @@ -1037,9 +1171,156 @@ taskq_thread_create(taskq_t *tq) wake_up_process(tqt->tqt_thread); + TQSTAT_INC(tq, threads_created); + return (tqt); } +static void +taskq_stats_init(taskq_t *tq) +{ + taskq_sums_t *tqs = &tq->tq_sums; + wmsum_init(&tqs->tqs_threads_active, 0); + wmsum_init(&tqs->tqs_threads_idle, 0); + wmsum_init(&tqs->tqs_threads_total, 0); + wmsum_init(&tqs->tqs_tasks_pending, 0); + wmsum_init(&tqs->tqs_tasks_priority, 0); + wmsum_init(&tqs->tqs_tasks_total, 0); + wmsum_init(&tqs->tqs_tasks_delayed, 0); + wmsum_init(&tqs->tqs_entries_free, 0); + wmsum_init(&tqs->tqs_threads_created, 0); + wmsum_init(&tqs->tqs_threads_destroyed, 0); + wmsum_init(&tqs->tqs_tasks_dispatched, 0); + wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0); + wmsum_init(&tqs->tqs_tasks_executed_normal, 0); + wmsum_init(&tqs->tqs_tasks_executed_priority, 0); + wmsum_init(&tqs->tqs_tasks_executed, 0); + wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0); + wmsum_init(&tqs->tqs_tasks_cancelled, 0); + wmsum_init(&tqs->tqs_thread_wakeups, 0); + wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0); + wmsum_init(&tqs->tqs_thread_sleeps, 0); +} + +static void +taskq_stats_fini(taskq_t *tq) +{ + taskq_sums_t *tqs = &tq->tq_sums; + wmsum_fini(&tqs->tqs_threads_active); + wmsum_fini(&tqs->tqs_threads_idle); + wmsum_fini(&tqs->tqs_threads_total); + wmsum_fini(&tqs->tqs_tasks_pending); + 
wmsum_fini(&tqs->tqs_tasks_priority); + wmsum_fini(&tqs->tqs_tasks_total); + wmsum_fini(&tqs->tqs_tasks_delayed); + wmsum_fini(&tqs->tqs_entries_free); + wmsum_fini(&tqs->tqs_threads_created); + wmsum_fini(&tqs->tqs_threads_destroyed); + wmsum_fini(&tqs->tqs_tasks_dispatched); + wmsum_fini(&tqs->tqs_tasks_dispatched_delayed); + wmsum_fini(&tqs->tqs_tasks_executed_normal); + wmsum_fini(&tqs->tqs_tasks_executed_priority); + wmsum_fini(&tqs->tqs_tasks_executed); + wmsum_fini(&tqs->tqs_tasks_delayed_requeued); + wmsum_fini(&tqs->tqs_tasks_cancelled); + wmsum_fini(&tqs->tqs_thread_wakeups); + wmsum_fini(&tqs->tqs_thread_wakeups_nowork); + wmsum_fini(&tqs->tqs_thread_sleeps); +} + +static int +taskq_kstats_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) + return (EACCES); + + taskq_t *tq = ksp->ks_private; + taskq_kstats_t *tqks = ksp->ks_data; + + tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads; + tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc; + tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc; + + taskq_sums_t *tqs = &tq->tq_sums; + + tqks->tqks_threads_active.value.ui64 = + wmsum_value(&tqs->tqs_threads_active); + tqks->tqks_threads_idle.value.ui64 = + wmsum_value(&tqs->tqs_threads_idle); + tqks->tqks_threads_total.value.ui64 = + wmsum_value(&tqs->tqs_threads_total); + tqks->tqks_tasks_pending.value.ui64 = + wmsum_value(&tqs->tqs_tasks_pending); + tqks->tqks_tasks_priority.value.ui64 = + wmsum_value(&tqs->tqs_tasks_priority); + tqks->tqks_tasks_total.value.ui64 = + wmsum_value(&tqs->tqs_tasks_total); + tqks->tqks_tasks_delayed.value.ui64 = + wmsum_value(&tqs->tqs_tasks_delayed); + tqks->tqks_entries_free.value.ui64 = + wmsum_value(&tqs->tqs_entries_free); + tqks->tqks_threads_created.value.ui64 = + wmsum_value(&tqs->tqs_threads_created); + tqks->tqks_threads_destroyed.value.ui64 = + wmsum_value(&tqs->tqs_threads_destroyed); + tqks->tqks_tasks_dispatched.value.ui64 = + wmsum_value(&tqs->tqs_tasks_dispatched); + tqks->tqks_tasks_dispatched_delayed.value.ui64 = + wmsum_value(&tqs->tqs_tasks_dispatched_delayed); + tqks->tqks_tasks_executed_normal.value.ui64 = + wmsum_value(&tqs->tqs_tasks_executed_normal); + tqks->tqks_tasks_executed_priority.value.ui64 = + wmsum_value(&tqs->tqs_tasks_executed_priority); + tqks->tqks_tasks_executed.value.ui64 = + wmsum_value(&tqs->tqs_tasks_executed); + tqks->tqks_tasks_delayed_requeued.value.ui64 = + wmsum_value(&tqs->tqs_tasks_delayed_requeued); + tqks->tqks_tasks_cancelled.value.ui64 = + wmsum_value(&tqs->tqs_tasks_cancelled); + tqks->tqks_thread_wakeups.value.ui64 = + wmsum_value(&tqs->tqs_thread_wakeups); + tqks->tqks_thread_wakeups_nowork.value.ui64 = + wmsum_value(&tqs->tqs_thread_wakeups_nowork); + tqks->tqks_thread_sleeps.value.ui64 = + wmsum_value(&tqs->tqs_thread_sleeps); + + return (0); +} + +static void +taskq_kstats_init(taskq_t *tq) +{ + char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ + snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance); + + kstat_t *ksp = kstat_create("taskq", 0, name, "misc", + KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + ksp->ks_private = tq; + ksp->ks_update = taskq_kstats_update; + ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP); + memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t)); + kstat_install(ksp); + + tq->tq_ksp = ksp; +} + +static void +taskq_kstats_fini(taskq_t *tq) +{ + if (tq->tq_ksp == NULL) + return; + + kmem_free(tq->tq_ksp->ks_data, 
sizeof (taskq_kstats_t)); + kstat_delete(tq->tq_ksp); + + tq->tq_ksp = NULL; +} + taskq_t * taskq_create(const char *name, int threads_arg, pri_t pri, int minalloc, int maxalloc, uint_t flags) @@ -1104,6 +1385,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, init_waitqueue_head(&tq->tq_wait_waitq); tq->tq_lock_class = TQ_LOCK_GENERAL; INIT_LIST_HEAD(&tq->tq_taskqs); + taskq_stats_init(tq); if (flags & TASKQ_PREPOPULATE) { spin_lock_irqsave_nested(&tq->tq_lock, irqflags, @@ -1137,14 +1419,17 @@ taskq_create(const char *name, int threads_arg, pri_t pri, if (rc) { taskq_destroy(tq); - tq = NULL; - } else { - down_write(&tq_list_sem); - tq->tq_instance = taskq_find_by_name(name) + 1; - list_add_tail(&tq->tq_taskqs, &tq_list); - up_write(&tq_list_sem); + return (NULL); } + down_write(&tq_list_sem); + tq->tq_instance = taskq_find_by_name(name) + 1; + list_add_tail(&tq->tq_taskqs, &tq_list); + up_write(&tq_list_sem); + + /* Install kstats late, because the name includes tq_instance */ + taskq_kstats_init(tq); + return (tq); } EXPORT_SYMBOL(taskq_create); @@ -1177,6 +1462,8 @@ taskq_destroy(taskq_t *tq) taskq_wait(tq); + taskq_kstats_fini(tq); + /* remove taskq from global list used by the kstats */ down_write(&tq_list_sem); list_del(&tq->tq_taskqs); @@ -1230,6 +1517,7 @@ taskq_destroy(taskq_t *tq) spin_unlock_irqrestore(&tq->tq_lock, flags); + taskq_stats_fini(tq); kmem_strfree(tq->tq_name); kmem_free(tq, sizeof (taskq_t)); } @@ -1271,6 +1559,100 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri, } EXPORT_SYMBOL(taskq_create_synced); +static kstat_t *taskq_summary_ksp = NULL; + +static int +spl_taskq_kstat_headers(char *buf, size_t size) +{ + size_t n = snprintf(buf, size, + "%-20s | %-17s | %-23s\n" + "%-20s | %-17s | %-23s\n" + "%-20s | %-17s | %-23s\n", + "", "threads", "tasks on queue", + "taskq name", "tot [act idl] max", " pend [ norm high] dly", + "--------------------", "-----------------", + "-----------------------"); + return (n >= size ? 
ENOMEM : 0); +} + +static int +spl_taskq_kstat_data(char *buf, size_t size, void *data) +{ + struct list_head *tql = NULL; + taskq_t *tq; + char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ + char threads[25]; + char tasks[30]; + size_t n; + int err = 0; + + down_read(&tq_list_sem); + list_for_each_prev(tql, &tq_list) { + tq = list_entry(tql, taskq_t, tq_taskqs); + + mutex_enter(tq->tq_ksp->ks_lock); + taskq_kstats_update(tq->tq_ksp, KSTAT_READ); + taskq_kstats_t *tqks = tq->tq_ksp->ks_data; + + snprintf(name, sizeof (name), "%s.%d", tq->tq_name, + tq->tq_instance); + snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu", + tqks->tqks_threads_total.value.ui64, + tqks->tqks_threads_active.value.ui64, + tqks->tqks_threads_idle.value.ui64, + tqks->tqks_threads_max.value.ui64); + snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu", + tqks->tqks_tasks_total.value.ui64, + tqks->tqks_tasks_pending.value.ui64, + tqks->tqks_tasks_priority.value.ui64, + tqks->tqks_tasks_delayed.value.ui64); + + mutex_exit(tq->tq_ksp->ks_lock); + + n = snprintf(buf, size, "%-20s | %-17s | %-23s\n", + name, threads, tasks); + if (n >= size) { + err = ENOMEM; + break; + } + + buf = &buf[n]; + size -= n; + } + + up_read(&tq_list_sem); + + return (err); +} + +static void +spl_taskq_kstat_init(void) +{ + kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + ksp->ks_data = (void *)(uintptr_t)1; + ksp->ks_ndata = 1; + kstat_set_raw_ops(ksp, spl_taskq_kstat_headers, + spl_taskq_kstat_data, NULL); + kstat_install(ksp); + + taskq_summary_ksp = ksp; +} + +static void +spl_taskq_kstat_fini(void) +{ + if (taskq_summary_ksp == NULL) + return; + + kstat_delete(taskq_summary_ksp); + taskq_summary_ksp = NULL; +} + static unsigned int spl_taskq_kick = 0; /* @@ -1451,12 +1833,16 @@ spl_taskq_init(void) */ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC; + spl_taskq_kstat_init(); + return (0); } void spl_taskq_fini(void) { + spl_taskq_kstat_fini(); + taskq_destroy(dynamic_taskq); dynamic_taskq = NULL; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c index dbb8eefa7ec4..2af766ac2049 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c @@ -186,6 +186,13 @@ issig(void) schedule(); #endif + /* + * Dequeued SIGSTOP/SIGTSTP. + * Check if the process has another signal pending. + */ + if (signal_pending(current)) + return (1); + return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index f7af20c619a4..60287ccdda98 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -58,22 +58,16 @@ #include #include #include -#ifdef _KERNEL #include #include #include #include -#endif -#ifdef _KERNEL #if defined(MAX_ORDER) #define ABD_MAX_ORDER (MAX_ORDER) #elif defined(MAX_PAGE_ORDER) #define ABD_MAX_ORDER (MAX_PAGE_ORDER) #endif -#else -#define ABD_MAX_ORDER (1) -#endif typedef struct abd_stats { kstat_named_t abdstat_struct_size; @@ -193,11 +187,9 @@ abd_t *abd_zero_scatter = NULL; struct page; /* - * _KERNEL - Will point to ZERO_PAGE if it is available or it will be - * an allocated zero'd PAGESIZE buffer. - * Userspace - Will be an allocated zero'ed PAGESIZE buffer. - * - * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
+ * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will + * point to ZERO_PAGE if it is available or it will be an allocated zero'd + * PAGESIZE buffer. */ static struct page *abd_zero_page = NULL; @@ -232,7 +224,6 @@ abd_free_struct_impl(abd_t *abd) ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); } -#ifdef _KERNEL static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1; /* @@ -509,7 +500,7 @@ abd_alloc_zero_scatter(void) ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK; abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { sg_set_page(sg, abd_zero_page, PAGESIZE, 0); @@ -520,134 +511,6 @@ abd_alloc_zero_scatter(void) ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); } -#else /* _KERNEL */ - -#ifndef PAGE_SHIFT -#define PAGE_SHIFT (highbit64(PAGESIZE)-1) -#endif - -#define zfs_kmap_local(chunk) ((void *)chunk) -#define zfs_kunmap_local(addr) do { (void)(addr); } while (0) -#define local_irq_save(flags) do { (void)(flags); } while (0) -#define local_irq_restore(flags) do { (void)(flags); } while (0) -#define nth_page(pg, i) \ - ((struct page *)((void *)(pg) + (i) * PAGESIZE)) - -struct scatterlist { - struct page *page; - int length; - int end; -}; - -static void -sg_init_table(struct scatterlist *sg, int nr) -{ - memset(sg, 0, nr * sizeof (struct scatterlist)); - sg[nr - 1].end = 1; -} - -/* - * This must be called if any of the sg_table allocation functions - * are called. - */ -static void -abd_free_sg_table(abd_t *abd) -{ - int nents = ABD_SCATTER(abd).abd_nents; - vmem_free(ABD_SCATTER(abd).abd_sgl, - nents * sizeof (struct scatterlist)); -} - -#define for_each_sg(sgl, sg, nr, i) \ - for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) - -static inline void -sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) -{ - /* currently we don't use offset */ - ASSERT(offset == 0); - sg->page = page; - sg->length = len; -} - -static inline struct page * -sg_page(struct scatterlist *sg) -{ - return (sg->page); -} - -static inline struct scatterlist * -sg_next(struct scatterlist *sg) -{ - if (sg->end) - return (NULL); - - return (sg + 1); -} - -void -abd_alloc_chunks(abd_t *abd, size_t size) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(size); - struct scatterlist *sg; - int i; - - ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); - - abd_for_each_sg(abd, sg, nr_pages, i) { - struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - sg_set_page(sg, p, PAGESIZE, 0); - } - ABD_SCATTER(abd).abd_nents = nr_pages; -} - -void -abd_free_chunks(abd_t *abd) -{ - int i, n = ABD_SCATTER(abd).abd_nents; - struct scatterlist *sg; - - abd_for_each_sg(abd, sg, n, i) { - struct page *p = nth_page(sg_page(sg), 0); - umem_free_aligned(p, PAGESIZE); - } - abd_free_sg_table(abd); -} - -static void -abd_alloc_zero_scatter(void) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); - struct scatterlist *sg; - int i; - - abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - memset(abd_zero_page, 0, PAGESIZE); - abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; - abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; - 
ABD_SCATTER(abd_zero_scatter).abd_offset = 0; - ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; - abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - - sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages); - - abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { - sg_set_page(sg, abd_zero_page, PAGESIZE, 0); - } - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); -} - -#endif /* _KERNEL */ - boolean_t abd_size_alloc_linear(size_t size) { @@ -712,14 +575,10 @@ abd_free_zero_scatter(void) abd_free_struct(abd_zero_scatter); abd_zero_scatter = NULL; ASSERT3P(abd_zero_page, !=, NULL); -#if defined(_KERNEL) #if defined(HAVE_ZERO_PAGE_GPL_ONLY) abd_unmark_zfs_page(abd_zero_page); __free_page(abd_zero_page); #endif /* HAVE_ZERO_PAGE_GPL_ONLY */ -#else - umem_free_aligned(abd_zero_page, PAGESIZE); -#endif /* _KERNEL */ } static int @@ -1014,8 +873,6 @@ abd_cache_reap_now(void) { } -#if defined(_KERNEL) - /* * This is abd_iter_page(), the function underneath abd_iterate_page_func(). * It yields the next page struct and data offset and size within it, without @@ -1297,5 +1154,3 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); - -#endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index 75a9ea53225e..c6b9cb2ddb3f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -201,9 +201,9 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) * See also the comment above zfs_arc_shrinker_limit. */ int64_t can_free = btop(arc_evictable_memory()); - int64_t limit = zfs_arc_shrinker_limit != 0 ? - zfs_arc_shrinker_limit : INT64_MAX; - return (MIN(can_free, limit)); + if (current_is_kswapd() && zfs_arc_shrinker_limit) + can_free = MIN(can_free, zfs_arc_shrinker_limit); + return (can_free); } static unsigned long diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index 54ed70d0394f..e042116333fb 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; - char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, - NULL }; + char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n", + NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; @@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) * value from call_usermodehelper() will be (exitcode << 8 + signal). 
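 *
 * For illustration (an added example, not part of the original comment),
 * the fully populated vector now passed to call_usermodehelper() is:
 *
 *	{ "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
 *	  full_name, full_path, NULL }
 *
 * which is why the name/path assignments below moved from slots 5/6 to
 * 6/7 after "-i" was inserted.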
*/ dprintf("mount; name=%s path=%s\n", full_name, full_path); - argv[5] = full_name; - argv[6] = full_path; + argv[6] = full_name; + argv[7] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index d98d32c1f9fb..0a82b8858eb8 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -292,6 +292,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) { struct super_block *s; objset_t *os; + boolean_t issnap = B_FALSE; int err; err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); @@ -323,6 +324,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) if (zpl_enter(zfsvfs, FTAG) == 0) { if (os != zfsvfs->z_os) err = -SET_ERROR(EBUSY); + issnap = zfsvfs->z_issnap; zpl_exit(zfsvfs, FTAG); } else { err = -SET_ERROR(EBUSY); @@ -346,7 +348,11 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) return (ERR_PTR(err)); } s->s_flags |= SB_ACTIVE; - } else if ((flags ^ s->s_flags) & SB_RDONLY) { + } else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) { + /* + * Skip the ro check for snapshots: a snapshot is always mounted + * read-only, whether or not mount passed the ro flag. + */ deactivate_locked_super(s); return (ERR_PTR(-EBUSY)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 2beec6436bff..d1e3061b50e6 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1213,6 +1213,7 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->io_opt = limits->zql_io_opt; qlimits->physical_block_size = limits->zql_physical_block_size; qlimits->max_discard_sectors = limits->zql_max_discard_sectors; + qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; qlimits->discard_granularity = limits->zql_discard_granularity; #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES qlimits->features = @@ -1251,7 +1252,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; - zvol_queue_limits_apply(limits, zso->zvo_queue); #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) struct queue_limits qlimits; zvol_queue_limits_convert(limits, &qlimits); @@ -1261,13 +1261,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) return (1); } -#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES - blk_queue_set_write_cache(zso->zvo_queue, B_TRUE); -#endif - zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; + #else zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); if (zso->zvo_queue == NULL) @@ -1361,7 +1358,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) * request queue and generic disk structures for the block device.
*/ static zvol_state_t * -zvol_alloc(dev_t dev, const char *name) +zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) { zvol_state_t *zv; struct zvol_state_os *zso; @@ -1381,6 +1378,7 @@ zvol_alloc(dev_t dev, const char *name) zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; + zv->zv_volblocksize = volblocksize; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1670,7 +1668,8 @@ zvol_os_create_minor(const char *name) if (error) goto out_dmu_objset_disown; - zv = zvol_alloc(MKDEV(zvol_major, minor), name); + zv = zvol_alloc(MKDEV(zvol_major, minor), name, + doi->doi_data_block_size); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; @@ -1680,7 +1679,6 @@ zvol_os_create_minor(const char *name) if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; - zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c index 309d9bf14cd4..8dec5f27b0af 100644 --- a/sys/contrib/openzfs/module/zcommon/zfeature_common.c +++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c @@ -754,6 +754,12 @@ zpool_feature_init(void) "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_FAST_DEDUP, + "com.klarasystems:fast_dedup", "fast_dedup", + "Support for advanced deduplication", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/sys/contrib/openzfs/module/zcommon/zfs_valstr.c b/sys/contrib/openzfs/module/zcommon/zfs_valstr.c new file mode 100644 index 000000000000..e2d4d1aefefb --- /dev/null +++ b/sys/contrib/openzfs/module/zcommon/zfs_valstr.c @@ -0,0 +1,277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024, Klara Inc. + */ + +#include +#include +#include +#include +#include +#include "zfs_valstr.h" + +/* + * Each bit in a bitfield has three possible string representations: + * - single char + * - two-char pair + * - full name + */ +typedef struct { + const char vb_bit; + const char vb_pair[2]; + const char *vb_name; +} valstr_bit_t; + +/* + * Emits a character for each bit in `bits`, up to the number of elements + * in the table. Set bits get the character in vb_bit, clear bits get a + * space. This results in all strings having the same width, for easier + * visual comparison. 
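+ *
+ * For example (an illustration, not original text, using the zio_stage
+ * table defined later in this file): a value with only OPEN and READY
+ * set renders as an 'O' in column 1 and an 'R' in column 21 of a fixed
+ * 26-character field, with spaces everywhere else, so stages line up
+ * when several values are printed in rows.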
+ */ +static size_t +valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + out[n++] = (bits & mask) ? table[b].vb_bit : ' '; + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and + * separated by a `|` character. This gives a concise representation of the + * whole value. + */ +static size_t +valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + ASSERT3U(n, <=, outlen); + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + if (bits & mask) { + size_t len = (n > 0) ? 3 : 2; + if (n > outlen-len) + break; + if (n > 0) + out[n++] = '|'; + out[n++] = table[b].vb_pair[0]; + out[n++] = table[b].vb_pair[1]; + } + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits the full name for each bit set in `bits`, taken from vb_name, and + * separated by a space. This unambiguously shows the entire set of bits, but + * can get very long. + */ +static size_t +valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + ASSERT3U(n, <=, outlen); + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + if (bits & mask) { + size_t len = strlen(table[b].vb_name); + if (n > 0) + len++; + if (n > outlen-len) + break; + if (n > 0) { + out[n++] = ' '; + len--; + } + memcpy(&out[n], table[b].vb_name, len); + n += len; + } + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits the name of the given enum value in the table. + */ +static size_t +valstr_enum_str(const char **table, const size_t nelems, + int v, char *out, size_t outlen) +{ + ASSERT(out); + ASSERT3U(v, <, nelems); + if (v >= nelems) + return (0); + return (MIN(strlcpy(out, table[v], outlen), outlen)); +} + +/* + * These macros create the string tables for the given name, and implement + * the public functions described in zfs_valstr.h. + */ +#define _VALSTR_BITFIELD_IMPL(name, ...) \ +static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\ +size_t \ +zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_bits(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + \ +size_t \ +zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_pairs(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + \ +size_t \ +zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_str(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + +#define _VALSTR_ENUM_IMPL(name, ...) 
\ +static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ }; \ +size_t \ +zfs_valstr_ ## name(int v, char *out, size_t outlen) \ +{ \ + return (valstr_enum_str(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen)); \ +} \ + + +/* String tables */ + +/* ZIO flags: zio_flag_t, typically zio->io_flags */ +/* BEGIN CSTYLED */ +_VALSTR_BITFIELD_IMPL(zio_flag, + { '.', "DA", "DONT_AGGREGATE" }, + { '.', "RP", "IO_REPAIR" }, + { '.', "SH", "SELF_HEAL" }, + { '.', "RS", "RESILVER" }, + { '.', "SC", "SCRUB" }, + { '.', "ST", "SCAN_THREAD" }, + { '.', "PH", "PHYSICAL" }, + { '.', "CF", "CANFAIL" }, + { '.', "SP", "SPECULATIVE" }, + { '.', "CW", "CONFIG_WRITER" }, + { '.', "DR", "DONT_RETRY" }, + { '?', "??", "[UNUSED 11]" }, + { '.', "ND", "NODATA" }, + { '.', "ID", "INDUCE_DAMAGE" }, + { '.', "AL", "IO_ALLOCATING" }, + { '.', "RE", "IO_RETRY" }, + { '.', "PR", "PROBE" }, + { '.', "TH", "TRYHARD" }, + { '.', "OP", "OPTIONAL" }, + { '.', "DQ", "DONT_QUEUE" }, + { '.', "DP", "DONT_PROPAGATE" }, + { '.', "BY", "IO_BYPASS" }, + { '.', "RW", "IO_REWRITE" }, + { '.', "CM", "RAW_COMPRESS" }, + { '.', "EN", "RAW_ENCRYPT" }, + { '.', "GG", "GANG_CHILD" }, + { '.', "DD", "DDT_CHILD" }, + { '.', "GF", "GODFATHER" }, + { '.', "NP", "NOPWRITE" }, + { '.', "EX", "REEXECUTED" }, + { '.', "DG", "DELEGATED" }, +) +/* END CSTYLED */ + +/* + * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or + * zio->io_pipeline. + */ +/* BEGIN CSTYLED */ +_VALSTR_BITFIELD_IMPL(zio_stage, + { 'O', "O ", "OPEN" }, + { 'I', "RI", "READ_BP_INIT" }, + { 'I', "WI", "WRITE_BP_INIT" }, + { 'I', "FI", "FREE_BP_INIT" }, + { 'A', "IA", "ISSUE_ASYNC" }, + { 'W', "WC", "WRITE_COMPRESS" }, + { 'E', "EN", "ENCRYPT" }, + { 'C', "CG", "CHECKSUM_GENERATE" }, + { 'N', "NW", "NOP_WRITE" }, + { 'B', "BF", "BRT_FREE" }, + { 'd', "dS", "DDT_READ_START" }, + { 'd', "dD", "DDT_READ_DONE" }, + { 'd', "dW", "DDT_WRITE" }, + { 'd', "dF", "DDT_FREE" }, + { 'G', "GA", "GANG_ASSEMBLE" }, + { 'G', "GI", "GANG_ISSUE" }, + { 'D', "DT", "DVA_THROTTLE" }, + { 'D', "DA", "DVA_ALLOCATE" }, + { 'D', "DF", "DVA_FREE" }, + { 'D', "DC", "DVA_CLAIM" }, + { 'R', "R ", "READY" }, + { 'V', "VS", "VDEV_IO_START" }, + { 'V', "VD", "VDEV_IO_DONE" }, + { 'V', "VA", "VDEV_IO_ASSESS" }, + { 'C', "CV", "CHECKSUM_VERIFY" }, + { 'X', "X ", "DONE" }, +) +/* END CSTYLED */ + +/* ZIO priority: zio_priority_t, typically zio->io_priority */ +/* BEGIN CSTYLED */ +_VALSTR_ENUM_IMPL(zio_priority, + "SYNC_READ", + "SYNC_WRITE", + "ASYNC_READ", + "ASYNC_WRITE", + "SCRUB", + "REMOVAL", + "INITIALIZING", + "TRIM", + "REBUILD", + "[NUM_QUEUEABLE]", + "NOW", +) +/* END CSTYLED */ + +#undef _VALSTR_BITFIELD_IMPL +#undef _VALSTR_ENUM_IMPL diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 94f492522f0d..c8c4d2270fae 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -113,7 +113,7 @@ abd_verify(abd_t *abd) ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | - ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD)); + ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { @@ -603,13 +603,11 @@ abd_get_zeros(size_t size) } /* - * Allocate a linear ABD structure for buf. 
+ * Create a linear ABD for an existing buf. */ -abd_t * -abd_get_from_buf(void *buf, size_t size) +static abd_t * +abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size) { - abd_t *abd = abd_alloc_struct(0); - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); /* @@ -625,6 +623,20 @@ abd_get_from_buf(void *buf, size_t size) return (abd); } +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(0); + return (abd_get_from_buf_impl(abd, buf, size)); +} + +abd_t * +abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size) +{ + abd_init_struct(abd); + return (abd_get_from_buf_impl(abd, buf, size)); +} + /* * Get the raw buffer associated with a linear ABD. */ diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 78c2cf8ec5c3..714a30e863a7 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1767,12 +1767,12 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); - void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; + boolean_t free_abd = B_FALSE; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT3P(abd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. @@ -1784,14 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - + abd = NULL; csize = zio_compress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel); - ASSERT3P(tmpbuf, !=, NULL); + hdr->b_l1hdr.b_pabd, &abd, lsize, hdr->b_complevel); + ASSERT3P(abd, !=, NULL); ASSERT3U(csize, <=, psize); - abd = abd_get_from_buf(tmpbuf, lsize); - abd_take_ownership_of_buf(abd, B_TRUE); abd_zero_off(abd, csize, psize - csize); + free_abd = B_TRUE; } /* @@ -1810,16 +1809,10 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); - else if (ret != ENOENT) - goto error; + else if (ret == ENOENT) + ret = 0; - if (tmpbuf != NULL) - abd_free(abd); - - return (0); - -error: - if (tmpbuf != NULL) + if (free_abd) abd_free(abd); return (ret); @@ -1836,7 +1829,6 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; - void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); @@ -1871,17 +1863,14 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * linear buffer and wrapping it in an abd later. 
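 *
 * (Illustrative sketch, an addition rather than part of the original
 * change: the new abd_get_from_buf_struct() wraps an existing buffer in
 * a caller-owned, often stack-allocated, abd_t, so a short-lived wrapper
 * needs no heap allocation:
 *
 *	abd_t dabd;
 *	abd_get_from_buf_struct(&dabd, buf, len);
 *	...use dabd as a linear ABD...
 *	abd_free(&dabd);	releases the wrapper only, not buf
 *
 * The arc_buf_fill() and decode_embedded_bp() hunks below follow this
 * pattern.)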
*/ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); - tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { - abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } - abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; @@ -2123,10 +2112,14 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, /* Skip byteswapping and checksumming (already done) */ return (0); } else { + abd_t dabd; + abd_get_from_buf_struct(&dabd, buf->b_data, + HDR_GET_LSIZE(hdr)); error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, buf->b_data, + hdr->b_l1hdr.b_pabd, &dabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); + abd_free(&dabd); /* * Absent hardware errors or software bugs, this should @@ -8531,18 +8524,15 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); - void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { - abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } - abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; @@ -9037,9 +9027,8 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - size_t bufsize = MAX(size, asize); - void *buf = zio_buf_alloc(bufsize); - uint64_t csize = zio_compress_data(compress, to_write, &buf, + cabd = abd_alloc_for_io(MAX(size, asize), ismd); + uint64_t csize = zio_compress_data(compress, to_write, &cabd, size, hdr->b_complevel); if (csize > psize) { /* @@ -9047,13 +9036,12 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, * psize. Even if it fits into asize, it does not * matter, since checksum will never match on read. */ - zio_buf_free(buf, bufsize); + abd_free(cabd); return (SET_ERROR(EIO)); } if (asize > csize) - memset((char *)buf + csize, 0, asize - csize); - to_write = cabd = abd_get_from_buf(buf, bufsize); - abd_take_ownership_of_buf(cabd, B_TRUE); + abd_zero_off(cabd, csize, asize - csize); + to_write = cabd; } if (HDR_ENCRYPTED(hdr)) { @@ -9158,12 +9146,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) */ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* - * If pass == 1 or 3, we cache MRU metadata and data - * respectively. 
+ * pass == 0: MFU meta + * pass == 1: MRU meta + * pass == 2: MFU data + * pass == 3: MRU data */ - if (l2arc_mfuonly) { + if (l2arc_mfuonly == 1) { if (pass == 1 || pass == 3) continue; + } else if (l2arc_mfuonly > 1) { + if (pass == 3) + continue; } uint64_t passed_sz = 0; @@ -10179,7 +10172,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev, { int err = 0; zio_cksum_t cksum; - abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); @@ -10241,16 +10233,22 @@ l2arc_log_blk_read(l2arc_dev_t *dev, switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; - case ZIO_COMPRESS_LZ4: - abd = abd_alloc_for_io(asize, B_TRUE); + case ZIO_COMPRESS_LZ4: { + abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); - if ((err = zio_decompress_data( + abd_t dabd; + abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb)); + err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), - abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { + abd, &dabd, asize, sizeof (*this_lb), NULL); + abd_free(&dabd); + abd_free(abd); + if (err != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; + } default: err = SET_ERROR(EINVAL); goto cleanup; @@ -10267,8 +10265,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev, l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } - if (abd != NULL) - abd_free(abd); return (err); } @@ -10504,7 +10500,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; - uint8_t *tmpbuf = NULL; + abd_t *abd = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); @@ -10527,7 +10523,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, - abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0); + abd_buf->abd, &abd, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); @@ -10553,27 +10549,26 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ - memset(tmpbuf + psize, 0, asize - psize); + abd_zero_off(abd, psize, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ - memcpy(tmpbuf, lb, sizeof (*lb)); + abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ - fletcher_4_native(tmpbuf, asize, NULL, + abd_fletcher_4_native(abd, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ - abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); - abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); + abd_buf->abd = abd; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); diff --git a/sys/contrib/openzfs/module/zfs/blkptr.c b/sys/contrib/openzfs/module/zfs/blkptr.c index d85f0737f6f6..ac801c2bcf3f 100644 --- a/sys/contrib/openzfs/module/zfs/blkptr.c +++ b/sys/contrib/openzfs/module/zfs/blkptr.c @@ -142,8 +142,13 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { uint8_t dstbuf[BPE_PAYLOAD_SIZE]; decode_embedded_bp_compressed(bp, dstbuf); - 
VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), - dstbuf, buf, psize, buflen, NULL)); + abd_t cabd, dabd; + abd_get_from_buf_struct(&cabd, dstbuf, psize); + abd_get_from_buf_struct(&dabd, buf, buflen); + VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &cabd, + &dabd, psize, buflen, NULL)); + abd_free(&dabd); + abd_free(&cabd); } else { ASSERT3U(lsize, ==, psize); decode_embedded_bp_compressed(bp, buf); diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c index 8faa6c2a2528..914260e742f9 100644 --- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c +++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c @@ -204,6 +204,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk) void dataset_kstats_rename(dataset_kstats_t *dk, const char *name) { + if (dk->dk_kstats == NULL) + return; + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; char *ds_name; diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index d70ae1a031d5..710b12bbef51 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * # DDT: Deduplication tables @@ -74,12 +75,19 @@ * fill the BP with the DVAs from the entry, increment the refcount and cause * the write IO to return immediately. * - * Each ddt_phys_t slot in the entry represents a separate dedup block for the - * same content/checksum. The slot is selected based on the zp_copies parameter - * the block is written with, that is, the number of DVAs in the block. The - * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" - * feature. These are no longer written, and will be freed if encountered on - * old pools. + * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup + * block for the same content/checksum. The slot is selected based on the + * zp_copies parameter the block is written with, that is, the number of DVAs + * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for the + * now-removed "dedupditto" feature. These are no longer written, and will be + * freed if encountered on old pools. + * + * If the "fast_dedup" feature is enabled, new dedup tables will be created + * with the "flat phys" option. In this mode, there is only one ddt_phys_t + * slot. If a write is issued for an entry that exists, but has fewer DVAs, + * then only as many new DVAs are allocated and written to make up the + * shortfall. The existing entry is then extended (ddt_phys_extend()) with the + * new DVAs. * * ## Lifetime of an entry * @@ -117,6 +125,35 @@ * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. * + * ## Dedup table pruning + * + * As a complement to the dedup quota feature, ddtprune allows removal of older + * non-duplicate entries to make room for newer duplicate entries. The amount + * to prune can be based on a target percentage of the unique entries or based + * on the age (i.e., prune unique entries older than N days). + * + * ## Dedup log + * + * Historically, all entries modified in a txg were written back to dedup + * storage objects at the end of every txg. This could cause significant + * overheads, as each entry only takes up a tiny portion of a ZAP leaf node, + * and so required reading the whole node, updating the entry, and writing it + * back. On busy pools, this could add serious IO and memory overheads.
+ * + * To address this, the dedup log was added. If the "fast_dedup" feature is + * enabled, at the end of each txg, modified entries will be copied to an + * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the + * same block is requested again, the in-memory object will be checked first, + * and if it's there, the entry is inflated back onto the live tree without going + * to storage. The on-disk log is only read at pool import time, to reload the + * in-memory log. + * + * Each txg, some amount of the in-memory log will be flushed out to a DDT + * storage object (i.e. ZAP) as normal. OpenZFS will try hard to flush enough to + * keep up with the rate of change on dedup entries, but not so much that it + * would impact overall throughput, and without using too much memory. See the + * zfs_dedup_log_* tuneables in zfs(4) for more details. + * + * ## Repair IO + * + * If a read on a dedup block fails, but there are other copies of the block in @@ -129,6 +166,16 @@ * from the alternate block. If the block is actually damaged, this will invoke * the pool's "self-healing" mechanism, and repair the block. + * + * If the "fast_dedup" feature is enabled, the "flat phys" option will be in + * use, so there is only ever one ddt_phys_t slot. The repair process will + * still happen in this case, though it is unlikely to succeed as there will + * usually be no other equivalent blocks to fall back on (though there might + * be, if this was an early version of a dedup'd block that has since been + * extended). + * + * Note that this repair mechanism is in addition to and separate from the + * regular OpenZFS scrub and self-healing mechanisms. + * * ## Scanning (scrub/resilver) * * If dedup is active, the scrub machinery will walk the dedup table first, and @@ -143,6 +190,12 @@ * position on the object even if the object changes, the pool is exported, or * OpenZFS is upgraded. + * + * If the "fast_dedup" feature is enabled and the table has a log, the scan + * cannot begin until entries on the log are flushed, as the on-disk log has no + * concept of a "stable position". Instead, the log flushing process will enter + * a more aggressive mode, to flush out as much as is necessary as soon as + * possible, in order to begin the scan as soon as possible. + * * ## Interaction with block cloning * * If block cloning and dedup are both enabled on a pool, BRT will look for the @@ -161,7 +214,15 @@ c == ZIO_CHECKSUM_BLAKE3) static kmem_cache_t *ddt_cache; -static kmem_cache_t *ddt_entry_cache; + +static kmem_cache_t *ddt_entry_flat_cache; +static kmem_cache_t *ddt_entry_trad_cache; + +#define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE) /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. @@ -174,6 +235,39 @@ int zfs_dedup_prefetch = 0; */ uint_t dedup_class_wait_txgs = 5; +/* + * How many DDT prune entries to add to the DDT sync AVL tree. + * Note these additional entries have a memory footprint of a + * ddt_entry_t (216 bytes). + */ +static uint32_t zfs_ddt_prunes_per_txg = 50000; + +/* + * For testing, synthesize aged DDT entries + * (in global scope for ztest) + */ +boolean_t ddt_prune_artificial_age = B_FALSE; +boolean_t ddt_dump_prune_histogram = B_FALSE; + +/* + * Don't do more than this many incremental flush passes per txg.
+ */ +uint_t zfs_dedup_log_flush_passes_max = 8; + +/* + * Minimum time to flush per txg. + */ +uint_t zfs_dedup_log_flush_min_time_ms = 1000; + +/* + * Minimum entries to flush per txg. + */ +uint_t zfs_dedup_log_flush_entries_min = 1000; + +/* + * Number of txgs to average flow rates across. + */ +uint_t zfs_dedup_log_flush_flow_rate_txgs = 10; static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, @@ -185,6 +279,88 @@ static const char *const ddt_class_name[DDT_CLASSES] = { "unique", }; +/* + * DDT feature flags automatically enabled for each on-disk version. Note that + * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled. + */ +static const uint64_t ddt_version_flags[] = { + [DDT_VERSION_LEGACY] = 0, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, +}; + +/* per-DDT kstats */ +typedef struct { + /* total lookups and whether they returned new or existing entries */ + kstat_named_t dds_lookup; + kstat_named_t dds_lookup_new; + kstat_named_t dds_lookup_existing; + + /* entries found on live tree, and if we had to wait for load */ + kstat_named_t dds_lookup_live_hit; + kstat_named_t dds_lookup_live_wait; + kstat_named_t dds_lookup_live_miss; + + /* entries found on log trees */ + kstat_named_t dds_lookup_log_hit; + kstat_named_t dds_lookup_log_active_hit; + kstat_named_t dds_lookup_log_flushing_hit; + kstat_named_t dds_lookup_log_miss; + + /* entries found on store objects */ + kstat_named_t dds_lookup_stored_hit; + kstat_named_t dds_lookup_stored_miss; + + /* number of entries on log trees */ + kstat_named_t dds_log_active_entries; + kstat_named_t dds_log_flushing_entries; + + /* avg updated/flushed entries per txg */ + kstat_named_t dds_log_ingest_rate; + kstat_named_t dds_log_flush_rate; + kstat_named_t dds_log_flush_time_rate; +} ddt_kstats_t; + +static const ddt_kstats_t ddt_kstats_template = { + { "lookup", KSTAT_DATA_UINT64 }, + { "lookup_new", KSTAT_DATA_UINT64 }, + { "lookup_existing", KSTAT_DATA_UINT64 }, + { "lookup_live_hit", KSTAT_DATA_UINT64 }, + { "lookup_live_wait", KSTAT_DATA_UINT64 }, + { "lookup_live_miss", KSTAT_DATA_UINT64 }, + { "lookup_log_hit", KSTAT_DATA_UINT64 }, + { "lookup_log_active_hit", KSTAT_DATA_UINT64 }, + { "lookup_log_flushing_hit", KSTAT_DATA_UINT64 }, + { "lookup_log_miss", KSTAT_DATA_UINT64 }, + { "lookup_stored_hit", KSTAT_DATA_UINT64 }, + { "lookup_stored_miss", KSTAT_DATA_UINT64 }, + { "log_active_entries", KSTAT_DATA_UINT64 }, + { "log_flushing_entries", KSTAT_DATA_UINT64 }, + { "log_ingest_rate", KSTAT_DATA_UINT32 }, + { "log_flush_rate", KSTAT_DATA_UINT32 }, + { "log_flush_time_rate", KSTAT_DATA_UINT32 }, +}; + +#ifdef _KERNEL +#define _DDT_KSTAT_STAT(ddt, stat) \ + &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64 +#define DDT_KSTAT_BUMP(ddt, stat) \ + do { atomic_inc_64(_DDT_KSTAT_STAT(ddt, stat)); } while (0) +#define DDT_KSTAT_ADD(ddt, stat, val) \ + do { atomic_add_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) +#define DDT_KSTAT_SUB(ddt, stat, val) \ + do { atomic_sub_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) +#define DDT_KSTAT_SET(ddt, stat, val) \ + do { atomic_store_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) +#define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0) +#else +#define DDT_KSTAT_BUMP(ddt, stat) do {} while (0) +#define DDT_KSTAT_ADD(ddt, stat, val) do {} while (0) +#define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0) +#define DDT_KSTAT_SET(ddt, stat, val) do {} while (0) +#define DDT_KSTAT_ZERO(ddt, stat) do {} while (0) +#endif /* _KERNEL */ + + static void 
ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) @@ -196,14 +372,18 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, ==, 0); VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); ASSERT3U(*objectp, !=, 0); - VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + + VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, + objectp, tx)); VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), @@ -220,13 +400,15 @@ ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t count; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY0(ddt_object_count(ddt, type, class, &count)); VERIFY0(count); - VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx)); VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); @@ -243,9 +425,18 @@ ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) char name[DDT_NAMELEN]; int error; + if (ddt->ddt_dir_object == 0) { + /* + * If we're configured but the containing dir doesn't exist + * yet, then this object can't possibly exist either. + */ + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + return (SET_ERROR(ENOENT)); + } + ddt_object_name(ddt, type, class, name); - error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); if (error != 0) return (error); @@ -315,7 +506,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, sizeof (dde->dde_phys))); + dde->dde_phys, DDT_PHYS_SIZE(ddt))); } static int @@ -352,13 +543,13 @@ ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) static int ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - ddt_entry_t *dde, dmu_tx_t *tx) + const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, - sizeof (dde->dde_phys), tx)); + ddt->ddt_object[type][class], &ddlwe->ddlwe_key, + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -373,13 +564,19 @@ ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - uint64_t *walk, ddt_entry_t *dde) + uint64_t *walk, ddt_lightweight_entry_t *ddlwe) { ASSERT(ddt_object_exists(ddt, type, class)); - return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], walk, &dde->dde_key, - dde->dde_phys, sizeof (dde->dde_phys))); + int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); + if (error == 0) { + 
ddlwe->ddlwe_type = type; + ddlwe->ddlwe_class = class; + return (0); + } + return (error); } int @@ -413,13 +610,25 @@ ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, } void -ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg) { ASSERT3U(txg, !=, 0); + ASSERT3U(v, <, DDT_PHYS_NONE); + uint64_t phys_birth; + const dva_t *dvap; + + if (v == DDT_PHYS_FLAT) { + phys_birth = ddp->ddp_flat.ddp_phys_birth; + dvap = ddp->ddp_flat.ddp_dva; + } else { + phys_birth = ddp->ddp_trad[v].ddp_phys_birth; + dvap = ddp->ddp_trad[v].ddp_dva; + } for (int d = 0; d < SPA_DVAS_PER_BP; d++) - bp->blk_dva[d] = ddp->ddp_dva[d]; - BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); + bp->blk_dva[d] = dvap[d]; + BP_SET_BIRTH(bp, txg, phys_birth); } /* @@ -427,13 +636,13 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) * will be missing the salt / IV required to do a full decrypting read. */ void -ddt_bp_create(enum zio_checksum checksum, - const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp) { BP_ZERO(bp); if (ddp != NULL) - ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v)); bp->blk_cksum = ddk->ddk_cksum; @@ -464,42 +673,125 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) } void -ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) { - ASSERT0(ddp->ddp_phys_birth); + ASSERT3U(v, <, DDT_PHYS_NONE); + int bp_ndvas = BP_GET_NDVAS(bp); + int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ? + SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + dva_t *dvas = (v == DDT_PHYS_FLAT) ? + ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_GET_BIRTH(bp); -} + int s = 0, d = 0; + while (s < bp_ndvas && d < ddp_max_dvas) { + if (DVA_IS_VALID(&dvas[d])) { + d++; + continue; + } + dvas[d] = bp->blk_dva[s]; + s++; d++; + } -void -ddt_phys_clear(ddt_phys_t *ddp) -{ - memset(ddp, 0, sizeof (*ddp)); -} + /* + * If the caller offered us more DVAs than we can fit, something has + * gone wrong in their accounting. zio_ddt_write() should never ask for + * more than we need. 
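+ *
+ * As an illustration (example values, not from the original source): if
+ * the entry already held { DVA0, 0, 0 } from an earlier copies=1 write,
+ * a copies=2 write passes in a BP carrying one freshly allocated DVA;
+ * the loop above skips the valid slot 0 and fills slot 1, leaving
+ * { DVA0, DVA1, 0 }. Only the shortfall was allocated and written.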
+ */ + ASSERT3U(s, ==, bp_ndvas); -void -ddt_phys_addref(ddt_phys_t *ddp) -{ - ddp->ddp_refcnt++; -} + if (BP_IS_ENCRYPTED(bp)) + dvas[2] = bp->blk_dva[2]; -void -ddt_phys_decref(ddt_phys_t *ddp) -{ - if (ddp) { - ASSERT3U(ddp->ddp_refcnt, >, 0); - ddp->ddp_refcnt--; + if (ddt_phys_birth(ddp, v) == 0) { + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp); + else + ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp); } } +void +ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + dst->ddp_flat = src->ddp_flat; + else + dst->ddp_trad[v] = src->ddp_trad[v]; +} + +void +ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE); + else + memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); +} + +static uint64_t +ddt_class_start(void) +{ + uint64_t start = gethrestime_sec(); + + if (ddt_prune_artificial_age) { + /* + * debug aide -- simulate a wider distribution + * so we don't have to wait for an aged DDT + * to test prune. + */ + int range = 1 << 21; + int percent = random_in_range(100); + if (percent < 50) { + range = range >> 4; + } else if (percent > 75) { + range /= 2; + } + start -= random_in_range(range); + } + + return (start); +} + +void +ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_refcnt++; + else + ddp->ddp_trad[v].ddp_refcnt++; +} + +uint64_t +ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + uint64_t *refcntp; + + if (v == DDT_PHYS_FLAT) + refcntp = &ddp->ddp_flat.ddp_refcnt; + else + refcntp = &ddp->ddp_trad[v].ddp_refcnt; + + ASSERT3U(*refcntp, >, 0); + (*refcntp)--; + return (*refcntp); +} + static void -ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp, + ddt_phys_variant_t v, uint64_t txg) { blkptr_t blk; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); /* * We clear the dedup bit so that zio_free() will actually free the @@ -507,30 +799,82 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) */ BP_SET_DEDUP(&blk, 0); - ddt_phys_clear(ddp); + ddt_phys_clear(ddp, v); zio_free(ddt->ddt_spa, txg, &blk); } -ddt_phys_t * -ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +uint64_t +ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + ASSERT3U(v, <, DDT_PHYS_NONE); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) - return (ddp); + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_phys_birth); + else + return (ddp->ddp_trad[v].ddp_phys_birth); +} + +int +ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + const dva_t *dvas = (v == DDT_PHYS_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + return (DVA_IS_VALID(&dvas[0]) + + DVA_IS_VALID(&dvas[1]) + + DVA_IS_VALID(&dvas[2]) * !encrypted); +} + +ddt_phys_variant_t +ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) +{ + if (dde == NULL) + return (DDT_PHYS_NONE); + + const ddt_univ_phys_t *ddp = dde->dde_phys; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && + BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { + return (DDT_PHYS_FLAT); + } + } else /* traditional phys */ { + for (int p = 0; p < DDT_PHYS_MAX; p++) { + if (DVA_EQUAL(BP_IDENTITY(bp), + &ddp->ddp_trad[p].ddp_dva[0]) && + BP_GET_BIRTH(bp) == + ddp->ddp_trad[p].ddp_phys_birth) { + return (p); + } + } } - return (NULL); + return (DDT_PHYS_NONE); } uint64_t -ddt_phys_total_refcnt(const ddt_entry_t *dde) +ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_refcnt); + else + return (ddp->ddp_trad[v].ddp_refcnt); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp) { uint64_t refcnt = 0; - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) - refcnt += dde->dde_phys[p].ddp_refcnt; + if (ddt->ddt_flags & DDT_FLAG_FLAT) + refcnt = ddp->ddp_flat.ddp_refcnt; + else + for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++) + refcnt += ddp->ddp_trad[v].ddp_refcnt; return (refcnt); } @@ -559,24 +903,37 @@ ddt_init(void) { ddt_cache = kmem_cache_create("ddt_cache", sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - ddt_entry_cache = kmem_cache_create("ddt_entry_cache", - sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache", + DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", + DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + ddt_log_init(); } void ddt_fini(void) { - kmem_cache_destroy(ddt_entry_cache); + ddt_log_fini(); + + kmem_cache_destroy(ddt_entry_trad_cache); + kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); } static ddt_entry_t * -ddt_alloc(const ddt_key_t *ddk) +ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk) { ddt_entry_t *dde; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - memset(dde, 0, sizeof (ddt_entry_t)); + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_FLAT_SIZE); + } else { + dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_TRAD_SIZE); + } + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -584,17 +941,31 @@ ddt_alloc(const ddt_key_t *ddk) return (dde); } -static void -ddt_free(ddt_entry_t *dde) +void +ddt_alloc_entry_io(ddt_entry_t *dde) { - for (int p = 0; p < DDT_PHYS_TYPES; p++) - ASSERT3P(dde->dde_lead_zio[p], ==, NULL); + if (dde->dde_io != NULL) + return; - if (dde->dde_repair_abd != NULL) - abd_free(dde->dde_repair_abd); + dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP); +} + +static void +ddt_free(const ddt_t *ddt, ddt_entry_t *dde) +{ + if (dde->dde_io != NULL) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) + ASSERT3P(dde->dde_io->dde_lead_zio[p], ==, NULL); + + if (dde->dde_io->dde_repair_abd != NULL) + abd_free(dde->dde_io->dde_repair_abd); + + kmem_free(dde->dde_io, sizeof (ddt_entry_io_t)); + } cv_destroy(&dde->dde_cv); - 
kmem_cache_free(ddt_entry_cache, dde); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_entry_flat_cache : ddt_entry_trad_cache, dde); } void @@ -602,8 +973,15 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + /* Entry is still in the log, so charge the entry back to it */ + if (dde->dde_flags & DDE_FLAG_LOGGED) { + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); } static boolean_t @@ -684,8 +1062,51 @@ ddt_prefetch_all(spa_t *spa) } } +static int ddt_configure(ddt_t *ddt, boolean_t new); + +/* + * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them + * to the ones in the entry. If they're different, then the passed-in BP is + * from a previous generation of this entry (i.e. was previously pruned) and we + * have to act like the entry doesn't exist at all. + * + * This should only happen during a lookup to free the block (zio_ddt_free()). + * + * XXX this is similar in spirit to ddt_phys_select(), maybe can combine + * -- robn, 2024-02-09 + */ +static boolean_t +ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde) +{ + /* If the BP has no DVAs, then this entry is good */ + uint_t ndvas = BP_GET_NDVAS(bp); + if (ndvas == 0) + return (B_TRUE); + + /* + * Only checking the phys for the copies. For flat, there's only one; + * for trad it'll be the one that has the matching set of DVAs. + */ + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? + dde->dde_phys->ddp_flat.ddp_dva : + dde->dde_phys->ddp_trad[ndvas].ddp_dva; + + /* + * Compare entry DVAs with the BP. They should all be there, but + * there's not really anything we can do if it's only partial anyway, + * that's an error somewhere else, maybe long ago. + */ + uint_t d; + for (d = 0; d < ndvas; d++) + if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d])) + return (B_FALSE); + ASSERT3U(d, ==, ndvas); + + return (B_TRUE); +} + ddt_entry_t * -ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +ddt_lookup(ddt_t *ddt, const blkptr_t *bp) { spa_t *spa = ddt->ddt_spa; ddt_key_t search; @@ -697,6 +1118,17 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) { + /* + * This is the first use of this DDT since the pool was + * created; finish getting it ready for use. + */ + VERIFY0(ddt_configure(ddt, B_TRUE)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + } + + DDT_KSTAT_BUMP(ddt, dds_lookup); + ddt_key_fill(&search, bp); /* Find an existing live entry */ @@ -707,11 +1139,16 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) return (NULL); /* If it's already loaded, we can just return it. */ - if (dde->dde_flags & DDE_FLAG_LOADED) - return (dde); + DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit); + if (dde->dde_flags & DDE_FLAG_LOADED) { + if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + return (dde); + return (NULL); + } /* Someone else is loading it, wait for it.
*/ dde->dde_waiters++; + DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait); while (!(dde->dde_flags & DDE_FLAG_LOADED)) cv_wait(&dde->dde_cv, &ddt->ddt_lock); dde->dde_waiters--; @@ -720,22 +1157,71 @@ if (dde->dde_flags & DDE_FLAG_OVERQUOTA) { if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); } return (NULL); } - return (dde); - } + DDT_KSTAT_BUMP(ddt, dds_lookup_existing); - /* Not found. */ - if (!add) + /* Make sure the loaded entry matches the BP */ + if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + return (dde); return (NULL); + } else + DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss); /* Time to make a new entry. */ - dde = ddt_alloc(&search); + dde = ddt_alloc(ddt, &search); + + /* Record the time this class was created (used by ddt prune) */ + if (ddt->ddt_flags & DDT_FLAG_FLAT) + dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start(); + avl_insert(&ddt->ddt_tree, dde, where); + /* If it's in the log tree, we can "load" it from there */ + if (ddt->ddt_flags & DDT_FLAG_LOG) { + ddt_lightweight_entry_t ddlwe; + + if (ddt_log_find_key(ddt, &search, &ddlwe)) { + /* + * See if we have the key first, and if so, set up + * the entry. + */ + dde->dde_type = ddlwe.ddlwe_type; + dde->dde_class = ddlwe.ddlwe_class; + memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, + DDT_PHYS_SIZE(ddt)); + /* Whatever we found isn't valid for this BP, eject */ + if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(ddt, dde); + return (NULL); + } + + /* Remove it and count it */ + if (ddt_log_remove_key(ddt, + ddt->ddt_log_active, &search)) { + DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); + } else { + VERIFY(ddt_log_remove_key(ddt, + ddt->ddt_log_flushing, &search)); + DDT_KSTAT_BUMP(ddt, + dds_lookup_log_flushing_hit); + } + + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + + DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit); + DDT_KSTAT_BUMP(ddt, dds_lookup_existing); + + return (dde); + } + + DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss); + } + /* * ddt_tree is now stable, so unlock and let everyone else keep moving. * Anyone landing on this entry will find it without DDE_FLAG_LOADED, @@ -764,27 +1250,71 @@ dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + boolean_t valid = B_TRUE; + if (dde->dde_type == DDT_TYPES && dde->dde_class == DDT_CLASSES && ddt_over_quota(spa)) { /* Over quota. If no one is waiting, clean up right now. */ if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); return (NULL); } /* Flag cleanup required */ dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { - ddt_stat_update(ddt, dde, -1ULL); + /* + * If what we loaded is no good for this BP and there's no one + * waiting for it, we can just remove it and get out. If it's no + * good but there are waiters, we have to leave it, because we + * don't know what they want. If it's not needed we'll end up + * taking an entry log/sync, but it can only happen if more + * than one previous version of this block is being deleted at + * the same time. This is extremely unlikely to happen and not + * worth the effort to deal with without taking an entry + * update.
+ */ + valid = ddt_entry_lookup_is_valid(ddt, bp, dde); + if (!valid && dde->dde_waiters == 0) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(ddt, dde); + return (NULL); + } + + DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit); + DDT_KSTAT_BUMP(ddt, dds_lookup_existing); + + /* + * The histograms only track inactive (stored or logged) blocks. + * We've just put an entry onto the live list, so we need to + * remove its counts. When it's synced back, it'll be re-added + * to the right one. + * + * We only do this when we successfully found it in the store. + * error == ENOENT means this is a new entry, and so it's already + * not counted. + */ + ddt_histogram_t *ddh = + &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_histogram_sub_entry(ddt, ddh, &ddlwe); + } else { + DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss); + DDT_KSTAT_BUMP(ddt, dds_lookup_new); } /* Entry loaded, everyone can proceed now */ dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); - return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde); + if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid) + return (NULL); + + return (dde); } void @@ -812,29 +1342,222 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp) } /* - * Key comparison. Any struct wanting to make use of this function must have - * the key as the first element. + * ddt_key_t comparison. Any struct wanting to make use of this function must + * have the key as the first element. Casts it to N uint64_ts, and checks until + * we find there's a difference. This is intended to match how ddt_zap.c drives + * the ZAPs (first uint64_t as the key prehash), which will minimise the number + * of ZAP blocks touched when flushing logged entries from an AVL walk. This is + * not an invariant for this function though, should you wish to change it.
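+ *
+ * Worked example (illustrative values, not original text): for
+ * k1 = { 0x1111, 0xaaaa, ... } and k2 = { 0x1111, 0xbbbb, ... }, the
+ * equal first words are skipped and the function returns
+ * TREE_CMP(0xaaaa, 0xbbbb) == -1, so keys that share their leading
+ * (prehash) word sort adjacently in the AVL tree.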
*/ -#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) - -typedef struct ddt_key_cmp { - uint16_t u16[DDT_KEY_CMP_LEN]; -} ddt_key_cmp_t; - int ddt_key_compare(const void *x1, const void *x2) { - const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1; - const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2; - int32_t cmp = 0; + const uint64_t *k1 = (const uint64_t *)x1; + const uint64_t *k2 = (const uint64_t *)x2; - for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { - cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i]; - if (likely(cmp)) - break; + int cmp; + for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++) + if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0)) + return (cmp); + + return (0); +} + +/* Create the containing dir for this DDT and bump the feature count */ +static void +ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, ==, 0); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + ddt->ddt_dir_object = zap_create_link(ddt->ddt_os, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx); + + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, + sizeof (uint64_t), 1, &ddt->ddt_version, tx)); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, + sizeof (uint64_t), 1, &ddt->ddt_flags, tx)); + + spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* Destroy the containing dir and deactivate the feature */ +static void +ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, !=, 0); + ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ASSERT(!ddt_object_exists(ddt, type, class)); + } } - return (TREE_ISIGN(cmp)); + ddt_log_destroy(ddt, tx); + + uint64_t count; + ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS)); + ASSERT3U(count, ==, 2); + + VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx)); + + ddt->ddt_dir_object = 0; + + spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* + * Determine flags and on-disk layout from what's already stored. If there's + * nothing stored, then if new is false, returns ENOENT, and if true, selects + * based on pool config. + */ +static int +ddt_configure(ddt_t *ddt, boolean_t new) +{ + spa_t *spa = ddt->ddt_spa; + char name[DDT_NAMELEN]; + int error; + + ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE); + + boolean_t fdt_enabled = + spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP); + boolean_t fdt_active = + spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP); + + /* + * First, look for the global DDT stats object. If it's not there, then + * no DDT has ever been written, and we know we're + * starting from scratch.
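+ * (The decision order, as I read the code below: no stats object means a brand-new DDT; otherwise a per-checksum FDT dir, if present, carries the version and flags; otherwise any legacy object in the MOS root means a traditional setup; otherwise fall through to not_found.)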
+ */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + if (error != 0) { + if (error != ENOENT) + return (error); + goto not_found; + } + + if (fdt_active) { + /* + * Now look for a DDT directory. If it exists, then it has + * everything we need. + */ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, + &ddt->ddt_dir_object); + if (error == 0) { + ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION, sizeof (uint64_t), 1, + &ddt->ddt_version); + if (error != 0) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_FLAGS, sizeof (uint64_t), 1, + &ddt->ddt_flags); + if (error != 0) + return (error); + + if (ddt->ddt_version != DDT_VERSION_FDT) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "unknown version %llu", spa_name(spa), + name, (u_longlong_t)ddt->ddt_version); + return (SET_ERROR(EINVAL)); + } + + if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "version=%llu unknown flags %llx", + spa_name(spa), name, + (u_longlong_t)ddt->ddt_flags, + (u_longlong_t)ddt->ddt_version); + return (SET_ERROR(EINVAL)); + } + + return (0); + } + if (error != ENOENT) + return (error); + } + + /* Any object in the root indicates a traditional setup. */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ddt_object_name(ddt, type, class, name); + uint64_t obj; + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), + 1, &obj); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + + return (0); + } + } + +not_found: + if (!new) + return (SET_ERROR(ENOENT)); + + /* Nothing on disk, so set up for the best version we can */ + if (fdt_enabled) { + ddt->ddt_version = DDT_VERSION_FDT; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = 0; /* create on first use */ + } else { + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + } + + return (0); +} + +static void +ddt_table_alloc_kstats(ddt_t *ddt) +{ + char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa)); + char *name = kmem_asprintf("ddt_stats_%s", + zio_checksum_table[ddt->ddt_checksum].ci_name); + + ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED, + sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (ddt->ddt_ksp != NULL) { + ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP); + memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t)); + ddt->ddt_ksp->ks_data = dds; + kstat_install(ddt->ddt_ksp); + } + + kmem_strfree(name); + kmem_strfree(mod); } static ddt_t * @@ -844,15 +1567,19 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); memset(ddt, 0, sizeof (ddt_t)); - mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); avl_create(&ddt->ddt_repair_tree, ddt_key_compare, 
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + + ddt_log_alloc(ddt); + ddt_table_alloc_kstats(ddt); return (ddt); } @@ -860,6 +1587,13 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) static void ddt_table_free(ddt_t *ddt) { + if (ddt->ddt_ksp != NULL) { + kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t)); + ddt->ddt_ksp->ks_data = NULL; + kstat_delete(ddt->ddt_ksp); + } + + ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); @@ -889,7 +1623,6 @@ ddt_load(spa_t *spa) error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); - if (error) return (error == ENOENT ? 0 : error); @@ -898,6 +1631,12 @@ ddt_load(spa_t *spa) continue; ddt_t *ddt = spa->spa_ddt[c]; + error = ddt_configure(ddt, B_FALSE); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -907,15 +1646,25 @@ ddt_load(spa_t *spa) } } + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + + DDT_KSTAT_SET(ddt, dds_log_active_entries, + avl_numnodes(&ddt->ddt_log_active->ddl_tree)); + DDT_KSTAT_SET(ddt, dds_log_flushing_entries, + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); + /* * Seed the cached histograms. */ memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; } + spa->spa_dedup_dspace = ~0ULL; + spa->spa_dedup_dsize = ~0ULL; + return (0); } @@ -964,7 +1713,8 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) ddt_key_fill(&ddk, bp); - dde = ddt_alloc(&ddk); + dde = ddt_alloc(ddt, &ddk); + ddt_alloc_entry_io(dde); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -979,7 +1729,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) } } - memset(dde->dde_phys, 0, sizeof (dde->dde_phys)); + memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt)); return (dde); } @@ -991,11 +1741,12 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) ddt_enter(ddt); - if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && + if (dde->dde_io->dde_repair_abd != NULL && + spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else - ddt_free(dde); + ddt_free(ddt, dde); ddt_exit(ddt); } @@ -1003,16 +1754,15 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) static void ddt_repair_entry_done(zio_t *zio) { + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *rdde = zio->io_private; - ddt_free(rdde); + ddt_free(ddt, rdde); } static void ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) { - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *rddp = rdde->dde_phys; ddt_key_t *ddk = &dde->dde_key; ddt_key_t *rddk = &rdde->dde_key; zio_t *zio; @@ -1021,15 +1771,31 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) zio = zio_null(rio, rio->io_spa, NULL, ddt_repair_entry_done, rdde, rio->io_flags); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth != rddp->ddp_phys_birth || - memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + 
for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *rddp = rdde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(ddp, v); + const dva_t *dvas, *rdvas; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dvas = ddp->ddp_flat.ddp_dva; + rdvas = rddp->ddp_flat.ddp_dva; + } else { + dvas = ddp->ddp_trad[p].ddp_dva; + rdvas = rddp->ddp_trad[p].ddp_dva; + } + + if (phys_birth == 0 || + phys_birth != ddt_phys_birth(rddp, v) || + memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP)) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, - ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), + NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + ZIO_DDT_CHILD_FLAGS(zio), NULL)); } zio_nowait(zio); @@ -1051,7 +1817,8 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) rdde_next = AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); - ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, + DDT_PHYS_NONE, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); @@ -1061,85 +1828,496 @@ } static void -ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx) +{ + /* + * Count all the entries stored for each type/class, and update the + * stats within (ddt_object_sync()). If there are no entries for the + * type/class, the whole object is removed. If all objects for the DDT + * are removed, its containing dir is removed, effectively resetting + * the entire DDT to an empty slate. + */ + uint64_t count = 0; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + uint64_t add, tcount = 0; + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + VERIFY0(ddt_object_count(ddt, type, class, + &add)); + tcount += add; + } + } + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (tcount == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + count += tcount; + } + + if (ddt->ddt_flags & DDT_FLAG_LOG) { + /* Include logged entries in the total count */ + count += avl_numnodes(&ddt->ddt_log_active->ddl_tree); + count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands.
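+ * (For example, as I understand it: a legacy DDT that empties out after the fast_dedup feature is enabled resets to DDT_VERSION_UNCONFIGURED here, so the next dedup write reconfigures it as DDT_VERSION_FDT.)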
+ */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; + } + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; +} + +static void +ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - ddt_type_t otype = dde->dde_type; + + /* + * Compute the target class, so we can decide whether or not to inform + * the scrub traversal (below). Note that we don't store this in the + * entry, as it might change multiple times before finally being + * committed (if we're logging). Instead, we recompute it in + * ddt_sync_entry(). + */ + uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys); + ddt_class_t nclass = + (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + + /* + * If the class changes, the order that we scan this bp changes. If it + * decreases, we could miss it, so scan it right now. (This covers both + * class changing while we are doing ddt_walk(), and when we are + * traversing.) + * + * We also do this when the refcnt goes to zero, because that change is + * only in the log so far; the blocks on disk won't be freed until + * the log is flushed, and the refcnt might increase before that. If it + * does, then we could miss it in the same way. + */ + if (refcnt == 0 || nclass < ddlwe->ddlwe_class) + dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, + ddlwe, tx); +} + +static void +ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx) +{ + ddt_key_t *ddk = &ddlwe->ddlwe_key; ddt_type_t ntype = DDT_TYPE_DEFAULT; - ddt_class_t oclass = dde->dde_class; - ddt_class_t nclass; - uint64_t total_refcnt = 0; + uint64_t refcnt = 0; - ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Compute the total refcnt. Along the way, issue frees for any DVAs + * we no longer want. + */ + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - ASSERT3P(dde->dde_lead_zio[p], ==, NULL); - if (ddp->ddp_phys_birth == 0) { - ASSERT0(ddp->ddp_refcnt); + if (ddt_phys_birth(ddp, v) == 0) { + ASSERT0(phys_refcnt); continue; } - if (p == DDT_PHYS_DITTO) { + if (DDT_PHYS_IS_DITTO(ddt, p)) { /* - * Note, we no longer create DDT-DITTO blocks, but we - * don't want to leak any written by older software. + * We don't want to keep any obsolete slots (eg ditto), + * regardless of their refcount, but we don't want to + * leak them either. So, free them. */ - ddt_phys_free(ddt, ddk, ddp, txg); + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); continue; } - if (ddp->ddp_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - total_refcnt += ddp->ddp_refcnt; + if (phys_refcnt == 0) + /* No remaining references, free it! */ + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); + refcnt += phys_refcnt; } - /* We do not create new DDT-DITTO blocks. */ - ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth); - if (total_refcnt > 1) - nclass = DDT_CLASS_DUPLICATE; - else - nclass = DDT_CLASS_UNIQUE; + /* Select the best class for the entry. */ + ddt_class_t nclass = + (refcnt > 1) ? 
DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + /* + * If an existing entry changed type or class, or its refcount reached + * zero, delete it from the DDT object + */ if (otype != DDT_TYPES && - (otype != ntype || oclass != nclass || total_refcnt == 0)) { + (otype != ntype || oclass != nclass || refcnt == 0)) { VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); - ASSERT3U( - ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); + ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT); } - if (total_refcnt != 0) { - dde->dde_type = ntype; - dde->dde_class = nclass; - ddt_stat_update(ddt, dde, 0); + /* + * Add or update the entry + */ + if (refcnt != 0) { + ddt_histogram_t *ddh = + &ddt->ddt_histogram[ntype][nclass]; + + ddt_histogram_add_entry(ddt, ddh, ddlwe); + if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); - VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); + VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx)); + } +} + +/* Calculate an exponential weighted moving average, lower limited to zero */ +static inline int32_t +_ewma(int32_t val, int32_t prev, uint32_t weight) +{ + ASSERT3U(val, >=, 0); + ASSERT3U(prev, >=, 0); + const int32_t new = + MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1)); + ASSERT3U(new, >=, 0); + return (new); +} + +/* Returns true if done for this txg */ +static boolean_t +ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) +{ + if (ddt->ddt_flush_pass == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) { + /* First run this txg, get set up */ + ddt->ddt_flush_start = gethrtime(); + ddt->ddt_flush_count = 0; + + /* + * How many entries we need to flush. We want to at + * least match the ingest rate. + */ + ddt->ddt_flush_min = MAX( + ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_entries_min); + + /* + * If we've been asked to flush everything in a hurry, + * try to dump as much as possible on this txg. In + * this case we're only limited by time, not amount. + */ + if (ddt->ddt_flush_force_txg > 0) + ddt->ddt_flush_min = + MAX(ddt->ddt_flush_min, avl_numnodes( + &ddt->ddt_log_flushing->ddl_tree)); + } else { + /* We already decided we're done for this txg */ + return (B_FALSE); + } + } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { + /* + * We already did some flushing on this pass, skip it. This + * happens when dsl_process_async_destroys() runs during a scan + * (on pass 1) and does an additional ddt_sync() to update + * freed blocks. + */ + return (B_FALSE); + } + + if (spa_sync_pass(ddt->ddt_spa) > + MAX(zfs_dedup_log_flush_passes_max, 1)) { + /* Too many passes this txg, defer until next. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* Nothing to flush, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? + MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), + SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); + + uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; + + if (elapsed_time >= target_time) { + /* Too long since we started, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + ddt->ddt_flush_pass++; + ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); + + /* + * Estimate how much time we'll need to flush the remaining entries + * based on how long it normally takes. 
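+ * (Illustrative arithmetic with my own numbers: with ddt_flush_min = 1000 entries, ddt_log_flush_time_rate = 40 and ddt_log_flush_rate = 2000, the first pass estimates want_time = 1000 * 40 / 2000 = 20.)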
+ */ + uint32_t want_time; + if (ddt->ddt_flush_pass == 1) { + /* First pass, use the average time/entries */ + if (ddt->ddt_log_flush_rate == 0) + /* Zero rate, just assume the whole time */ + want_time = target_time; + else + want_time = ddt->ddt_flush_min * + ddt->ddt_log_flush_time_rate / + ddt->ddt_log_flush_rate; + } else { + /* Later pass, calculate from this txg so far */ + want_time = ddt->ddt_flush_min * + elapsed_time / ddt->ddt_flush_count; + } + + /* Figure out how much time we have left */ + uint32_t remain_time = target_time - elapsed_time; + + /* Smear the remaining entries over the remaining passes. */ + uint32_t nentries = ddt->ddt_flush_min / + (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); + if (want_time > remain_time) { + /* + * We're behind; try to catch up a bit by doubling the amount + * this pass. If we're behind that means we're in a later + * pass and likely have most of the remaining time to + * ourselves. If we're in the last couple of passes, then + * doubling might just take us over the timeout, but probably + * not by much, and it stops us falling behind. If we're + * in the middle passes, there'll be more to do, but it + * might just help us catch up a bit and we'll recalculate on + * the next pass anyway. + */ + nentries = MIN(ddt->ddt_flush_min, nentries*2); + } + + ddt_lightweight_entry_t ddlwe; + uint32_t count = 0; + while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { + ddt_sync_flush_entry(ddt, &ddlwe, + ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); + + /* End this pass if we've synced as much as we need to. */ + if (++count >= nentries) + break; + } + ddt->ddt_flush_count += count; + ddt->ddt_flush_min -= count; + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* We emptied it, so truncate on-disk */ + DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); + ddt_log_truncate(ddt, tx); + /* No more passes needed this txg */ + ddt->ddt_flush_pass = 0; + } else { + /* More to do next time, save checkpoint */ + DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); + ddt_log_checkpoint(ddt, &ddlwe, tx); + } + + ddt_sync_update_stats(ddt, tx); + + return (ddt->ddt_flush_pass == 0); +} + +static inline void +ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) +{ + /* + * If we're not forcing flush, and not being asked to start, then + * there's nothing more to do. + */ + if (txg == 0) { + /* Update requested, are we currently forcing flush? */ + if (ddt->ddt_flush_force_txg == 0) + return; + txg = ddt->ddt_flush_force_txg; + } + + /* + * If either of the logs has unflushed entries before + * the wanted txg, set the force txg, otherwise clear it. + */ + + if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) && + ddt->ddt_log_active->ddl_first_txg <= txg) || + (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && + ddt->ddt_log_flushing->ddl_first_txg <= txg)) { + ddt->ddt_flush_force_txg = txg; + return; + } + + /* + * Nothing to flush behind the given txg, so we can clear force flush + * state. + */ + ddt->ddt_flush_force_txg = 0; +} + +static void +ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT(avl_is_empty(&ddt->ddt_tree)); + + /* Don't do any flushing when the pool is ready to shut down */ + if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) + return; + + /* Try to flush some.
*/ + if (!ddt_sync_flush_log_incremental(ddt, tx)) + /* More to do next time */ + return; + + /* No more flushing this txg, so we can do end-of-txg housekeeping */ + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && + !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { + /* + * No more to flush, and the active list has stuff, so + * try to swap the logs for next time. + */ + if (ddt_log_swap(ddt, tx)) { + DDT_KSTAT_ZERO(ddt, dds_log_active_entries); + DDT_KSTAT_SET(ddt, dds_log_flushing_entries, + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); + } + } + + /* If force flush is no longer necessary, turn it off. */ + ddt_flush_force_update_txg(ddt, 0); + + /* + * Update flush rate. This is an exponential weighted moving average of + * the number of entries flushed over recent txgs. + */ + ddt->ddt_log_flush_rate = _ewma( + ddt->ddt_flush_count, ddt->ddt_log_flush_rate, + zfs_dedup_log_flush_flow_rate_txgs); + DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate); + + /* + * Update flush time rate. This is an exponential weighted moving + * average of the total time taken to flush over recent txgs. + */ + ddt->ddt_log_flush_time_rate = _ewma( + ddt->ddt_log_flush_time_rate, + ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), + zfs_dedup_log_flush_flow_rate_txgs); + DDT_KSTAT_SET(ddt, dds_log_flush_time_rate, + ddt->ddt_log_flush_time_rate); +} + +static void +ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) +{ + uint64_t count = avl_numnodes(&ddt->ddt_tree); + + if (count > 0) { + ddt_log_update_t dlu = {0}; + ddt_log_begin(ddt, count, tx, &dlu); + + ddt_entry_t *dde; + void *cookie = NULL; + ddt_lightweight_entry_t ddlwe; + while ((dde = + avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_log_entry(ddt, &ddlwe, &dlu); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); + } + + ddt_log_commit(ddt, &dlu); + + DDT_KSTAT_SET(ddt, dds_log_active_entries, + avl_numnodes(&ddt->ddt_log_active->ddl_tree)); /* - * If the class changes, the order that we scan this bp - * changes. If it decreases, we could miss it, so - * scan it right now. (This covers both class changing - * while we are doing ddt_walk(), and when we are - * traversing.) + * Sync the stats for the store objects. Even though we haven't + * modified anything on those objects, they're no longer the + * source of truth for entries that are now in the log, and we + * need the on-disk counts to reflect that, otherwise we'll + * miscount later when importing. */ - if (nclass < oclass) { - dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, dde, tx); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; + class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) + ddt_object_sync(ddt, type, class, tx); + } } + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; + } + + if (spa_sync_pass(ddt->ddt_spa) == 1) { + /* + * Update ingest rate. This is an exponential weighted moving + * average of the number of entries changed over recent txgs. + * The ramp-up cost shouldn't matter too much because the + * flusher will be trying to take at least the minimum anyway. 
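+ * (Worked example of _ewma() above, my numbers: prev = 1000, val = 3000, weight = 32 gives 1000 + (3000 - 1000) / 32 = 1062, so a sudden burst only moves the rate gradually.)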
+ */ + ddt->ddt_log_ingest_rate = _ewma( + count, ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_flow_rate_txgs); + DDT_KSTAT_SET(ddt, dds_log_ingest_rate, + ddt->ddt_log_ingest_rate); } } static void -ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) { - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde; - void *cookie = NULL; - if (avl_numnodes(&ddt->ddt_tree) == 0) return; - ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); + ddt_entry_t *dde; + void *cookie = NULL; + while ((dde = avl_destroy_nodes( + &ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_sync_flush_entry(ddt, &ddlwe, + dde->dde_type, dde->dde_class, tx); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); + } + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; + ddt_sync_update_stats(ddt, tx); +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + + if (ddt->ddt_version == UINT64_MAX) + return; + + if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) { + ASSERT0(avl_numnodes(&ddt->ddt_tree)); + return; + } if (spa->spa_ddt_stat_object == 0) { spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, @@ -1147,31 +2325,13 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) DMU_POOL_DDT_STATS, tx); } - while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { - ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(dde); - } + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); - for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, count = 0; - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (ddt_object_exists(ddt, type, class)) { - ddt_object_sync(ddt, type, class, tx); - VERIFY0(ddt_object_count(ddt, type, class, - &add)); - count += add; - } - } - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (count == 0 && ddt_object_exists(ddt, type, class)) - ddt_object_destroy(ddt, type, class, tx); - } - } - - memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, - sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_table_log(ddt, tx); + else + ddt_sync_table_flush(ddt, tx); } void @@ -1201,7 +2361,9 @@ ddt_sync(spa_t *spa, uint64_t txg) ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) continue; - ddt_sync_table(ddt, tx, txg); + ddt_sync_table(ddt, tx); + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_flush_log(ddt, tx); ddt_repair_table(ddt, rio); } @@ -1211,8 +2373,41 @@ ddt_sync(spa_t *spa, uint64_t txg) dmu_tx_commit(tx); } -int -ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +void +ddt_walk_init(spa_t *spa, uint64_t txg) +{ + if (txg == 0) + txg = spa_syncing_txg(spa); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) + continue; + + ddt_enter(ddt); + ddt_flush_force_update_txg(ddt, txg); + ddt_exit(ddt); + } +} + +boolean_t +ddt_walk_ready(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) + continue; + + if (ddt->ddt_flush_force_txg > 0) + return (B_FALSE); + } 
+ + return (B_TRUE); +} + +static int +ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe, + uint64_t flags, boolean_t wait) { do { do { @@ -1220,15 +2415,21 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; if (ddt == NULL) continue; + + if (flags != 0 && + (ddt->ddt_flags & flags) != flags) + continue; + + if (wait && ddt->ddt_flush_force_txg > 0) + return (EAGAIN); + int error = ENOENT; if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) { error = ddt_object_walk(ddt, ddb->ddb_type, ddb->ddb_class, - &ddb->ddb_cursor, dde); + &ddb->ddb_cursor, ddlwe); } - dde->dde_type = ddb->ddb_type; - dde->dde_class = ddb->ddb_class; if (error == 0) return (0); if (error != ENOENT) @@ -1243,13 +2444,19 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) return (SET_ERROR(ENOENT)); } +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) +{ + return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE)); +} + /* * This function is used by Block Cloning (brt.c) to increase reference * counter for the DDT entry if the block is already in DDT. * * Return false if the block, despite having the D bit set, is not present - * in the DDT. Currently this is not possible but might be in the future. - * See the comment below. + * in the DDT. This is possible when the DDT has been pruned by an admin + * or by the DDT quota mechanism. */ boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp) @@ -1262,7 +2469,7 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ddt = ddt_select(spa, bp); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); /* Can be NULL if the entry for this block was pruned. */ if (dde == NULL) { @@ -1271,37 +2478,23 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (B_FALSE); } - if (dde->dde_type < DDT_TYPES) { - ddt_phys_t *ddp; - - ASSERT3S(dde->dde_class, <, DDT_CLASSES); - - ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; - + if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) { /* - * This entry already existed (dde_type is real), so it must - * have refcnt >0 at the start of this txg. We are called from - * brt_pending_apply(), before frees are issued, so the refcnt - * can't be lowered yet. Therefore, it must be >0. We assert - * this because if the order of BRT and DDT interactions were - * ever to change and the refcnt was ever zero here, then - * likely further action is required to fill out the DDT entry, - * and this is a place that is likely to be missed in testing. + * This entry was either synced to a store object (dde_type is + * real) or was logged. It must be properly on disk at this + * point, so we can just bump its refcount. */ - ASSERT3U(ddp->ddp_refcnt, >, 0); + int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - ddt_phys_addref(ddp); + ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* - * At the time of implementating this if the block has the - * DEDUP flag set it must exist in the DEDUP table, but - * there are many advocates that want ability to remove - * entries from DDT with refcnt=1. If this will happen, - * we may have a block with the DEDUP set, but which doesn't - * have a corresponding entry in the DDT. Be ready. + * If the block has the DEDUP flag set it still might not + * exist in the DEDUP table due to DDT pruning of entries + * where refcnt=1. 
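+ * In that case we return B_FALSE and, as I understand the caller, brt.c then takes its own BRT reference for the cloned block instead.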
*/ - ASSERT3S(dde->dde_class, ==, DDT_CLASSES); ddt_remove(ddt, dde); result = B_FALSE; } @@ -1312,5 +2505,279 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (result); } +typedef struct ddt_prune_entry { + ddt_t *dpe_ddt; + ddt_key_t dpe_key; + list_node_t dpe_node; + ddt_univ_phys_t dpe_phys[]; +} ddt_prune_entry_t; + +typedef struct ddt_prune_info { + spa_t *dpi_spa; + uint64_t dpi_txg_syncs; + uint64_t dpi_pruned; + list_t dpi_candidates; +} ddt_prune_info_t; + +/* + * Add prune candidates for ddt_sync during spa_sync + */ +static void +prune_candidates_sync(void *arg, dmu_tx_t *tx) +{ + (void) tx; + ddt_prune_info_t *dpi = arg; + ddt_prune_entry_t *dpe; + + spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER); + + /* Process the prune candidates collected so far */ + while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) { + blkptr_t blk; + ddt_t *ddt = dpe->dpe_ddt; + + ddt_enter(ddt); + + /* + * If it's on the live list, then it was loaded for update + * this txg and is no longer stale; skip it. + */ + if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) { + ddt_exit(ddt); + kmem_free(dpe, sizeof (*dpe)); + continue; + } + + ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key, + dpe->dpe_phys, DDT_PHYS_FLAT, &blk); + + ddt_entry_t *dde = ddt_lookup(ddt, &blk); + if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Zero the physical, so we don't try to free DVAs + * at flush nor try to reuse this entry. + */ + ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT); + + dpi->dpi_pruned++; + } + + ddt_exit(ddt); + kmem_free(dpe, sizeof (*dpe)); + } + + spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG); + dpi->dpi_txg_syncs++; +} + +/* + * Prune candidates are collected in open context and processed + * in sync context as part of ddt_sync_table(). + */ +static void +ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp) +{ + ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); + + size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE; + ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP); + + dpe->dpe_ddt = ddt; + dpe->dpe_key = *ddk; + memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE); + list_insert_head(list, dpe); +} + +/* + * Iterate over all the entries in the DDT unique class.
+ * The walk will perform one of the following operations: + * (a) build a histogram that can be used when pruning + * (b) prune entries older than the cutoff + * + * Also called by zdb(8) to dump the age histogram + */ +void +ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram) +{ + ddt_bookmark_t ddb = { + .ddb_class = DDT_CLASS_UNIQUE, + .ddb_type = 0, + .ddb_checksum = 0, + .ddb_cursor = 0 + }; + ddt_lightweight_entry_t ddlwe = {0}; + int error; + /* + * TBD: Unused variable total + * int total = 0, valid = 0; + */ + int valid = 0; + int candidates = 0; + uint64_t now = gethrestime_sec(); + ddt_prune_info_t dpi; + boolean_t pruning = (cutoff != 0); + + if (pruning) { + dpi.dpi_txg_syncs = 0; + dpi.dpi_pruned = 0; + dpi.dpi_spa = spa; + list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t), + offsetof(ddt_prune_entry_t, dpe_node)); + } + + if (histogram != NULL) + memset(histogram, 0, sizeof (ddt_age_histo_t)); + + while ((error = + ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + VERIFY(ddt); + + if (spa_shutting_down(spa) || issig()) + break; + /* + * TBD: Unused variable total + * total++; + */ + + ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); + ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1); + + uint64_t class_start = + ddlwe.ddlwe_phys.ddp_flat.ddp_class_start; + + /* + * If this entry is on the log, then the stored entry is stale + * and we should skip it. + */ + if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL)) + continue; + + /* prune older entries */ + if (pruning && class_start < cutoff) { + if (candidates++ >= zfs_ddt_prunes_per_txg) { + /* sync prune candidates in batches */ + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, prune_candidates_sync, + &dpi, 0, ZFS_SPACE_CHECK_NONE)); + candidates = 1; + } + ddt_prune_entry(&dpi.dpi_candidates, ddt, + &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys); + } + + /* build a histogram */ + if (histogram != NULL) { + uint64_t age = MAX(1, (now - class_start) / 3600); + int bin = MIN(highbit64(age) - 1, HIST_BINS - 1); + histogram->dah_entries++; + histogram->dah_age_histo[bin]++; + } + + valid++; + } + + if (pruning && valid > 0) { + if (!list_is_empty(&dpi.dpi_candidates)) { + /* sync out final batch of prune candidates */ + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + prune_candidates_sync, &dpi, 0, + ZFS_SPACE_CHECK_NONE)); + } + list_destroy(&dpi.dpi_candidates); + + zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs", + (u_longlong_t)dpi.dpi_pruned, + (int)((dpi.dpi_pruned * 100) / valid), + (u_longlong_t)dpi.dpi_txg_syncs); + } +} + +static uint64_t +ddt_total_entries(spa_t *spa) +{ + ddt_object_t ddo; + ddt_get_dedup_object_stats(spa, &ddo); + + return (ddo.ddo_count); +} + +int +ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + uint64_t cutoff; + uint64_t start_time = gethrtime(); + + if (spa->spa_active_ddt_prune) + return (SET_ERROR(EALREADY)); + if (ddt_total_entries(spa) == 0) + return (0); + + spa->spa_active_ddt_prune = B_TRUE; + + zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount, + unit == ZPOOL_DDT_PRUNE_PERCENTAGE ?
"%" : "seconds old or older"); + + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + ddt_age_histo_t histogram; + uint64_t oldest = 0; + + /* Make a pass over DDT to build a histogram */ + ddt_prune_walk(spa, 0, &histogram); + + int target = (histogram.dah_entries * amount) / 100; + + /* + * Figure out our cutoff date + * (i.e., which bins to prune from) + */ + for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) { + if (histogram.dah_age_histo[i] != 0) { + /* less than this bucket remaining */ + if (target < histogram.dah_age_histo[i]) { + oldest = MAX(1, (1< 0 && !spa_shutting_down(spa) && !issig()) { + /* Traverse DDT to prune entries older that our cuttoff */ + ddt_prune_walk(spa, cutoff, NULL); + } + + zfs_dbgmsg("%s: prune completed in %llu ms", + spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + + spa->spa_active_ddt_prune = B_FALSE; + return (0); +} + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, + "Max number of incremental dedup log flush passes per transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, + "Min time to spend on incremental dedup log flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, + "Min number of log entries to flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, + "Number of txgs to average flow rates across"); diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c new file mode 100644 index 000000000000..3aa07dc25b91 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/ddt_log.c @@ -0,0 +1,778 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, Klara Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * No more than this many txgs before swapping logs. + */ +uint_t zfs_dedup_log_txg_max = 8; + +/* + * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module + * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory. 
+ */ +uint64_t zfs_dedup_log_mem_max = 0; +uint_t zfs_dedup_log_mem_max_percent = 1; + + +static kmem_cache_t *ddt_log_entry_flat_cache; +static kmem_cache_t *ddt_log_entry_trad_cache; + +#define DDT_LOG_ENTRY_FLAT_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_LOG_ENTRY_TRAD_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_LOG_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE) + +void +ddt_log_init(void) +{ + ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache", + DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache", + DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * Max memory for log AVL entries. At least 1M, because we need + * something (that's ~3800 entries per tree). They can say 100% if they + * want; it just means they're at the mercy of the txg flush limit. + */ + if (zfs_dedup_log_mem_max == 0) { + zfs_dedup_log_mem_max_percent = + MIN(zfs_dedup_log_mem_max_percent, 100); + zfs_dedup_log_mem_max = (physmem * PAGESIZE) * + zfs_dedup_log_mem_max_percent / 100; + } + zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024); +} + +void +ddt_log_fini(void) +{ + kmem_cache_destroy(ddt_log_entry_trad_cache); + kmem_cache_destroy(ddt_log_entry_flat_cache); +} + +static void +ddt_log_name(ddt_t *ddt, char *name, uint_t n) +{ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG, + zio_checksum_table[ddt->ddt_checksum].ci_name, n); +} + +static void +ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx) +{ + dmu_buf_t *db; + VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data; + DLH_SET_VERSION(hdr, 1); + DLH_SET_FLAGS(hdr, ddl->ddl_flags); + hdr->dlh_length = ddl->ddl_length; + hdr->dlh_first_txg = ddl->ddl_first_txg; + hdr->dlh_checkpoint = ddl->ddl_checkpoint; + + dmu_buf_rele(db, FTAG); +} + +static void +ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + ASSERT3U(ddl->ddl_object, ==, 0); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + ddl->ddl_object = dmu_object_alloc(ddt->ddt_os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &ddl->ddl_object, tx)); + ddl->ddl_length = 0; + ddl->ddl_first_txg = tx->tx_txg; + ddt_log_update_header(ddt, ddl, tx); +} + +static void +ddt_log_create(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + + if (ddl->ddl_object == 0) + return; + + ASSERT0(ddl->ddl_length); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx)); + VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx)); + + ddl->ddl_object = 0; +} + +void +ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_update_stats(ddt_t *ddt) +{ + /* + * Log object stats.
We count the number of live entries in the log + * tree, even if there are more than on disk, and even if the same + * entry is on both append and flush trees, because that's more what + * the user expects to see. This does mean the on-disk size is not + * really correlated with the number of entries, but I don't think + * that's reasonable to expect anyway. + */ + dmu_object_info_t doi; + uint64_t nblocks; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); + nblocks = doi.doi_physical_blocks_512; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); + nblocks += doi.doi_physical_blocks_512; + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo->ddo_count = + avl_numnodes(&ddt->ddt_log_active->ddl_tree) + + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt); + ddo->ddo_dspace = nblocks << 9; +} + +void +ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) +{ + ASSERT3U(nentries, >, 0); + ASSERT3P(dlu->dlu_dbp, ==, NULL); + + if (ddt->ddt_log_active->ddl_object == 0) + ddt_log_create(ddt, tx); + + /* + * We want to store as many entries as we can in a block, but never + * split an entry across block boundaries. + */ + size_t reclen = P2ALIGN_TYPED( + sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) + + DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t); + ASSERT3U(reclen, <=, UINT16_MAX); + dlu->dlu_reclen = reclen; + + VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG, + &dlu->dlu_dn)); + dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP); + + uint64_t nblocks = howmany(nentries, + dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen); + uint64_t offset = ddt->ddt_log_active->ddl_length; + uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz; + + VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, + B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, + DMU_READ_NO_PREFETCH)); + + dlu->dlu_tx = tx; + dlu->dlu_block = dlu->dlu_offset = 0; +} + +static ddt_log_entry_t * +ddt_log_alloc_entry(ddt_t *ddt) +{ + ddt_log_entry_t *ddle; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE); + } else { + ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE); + } + + return (ddle); +} + +static void +ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + /* Create the log tree entry from a live or stored entry */ + avl_index_t where; + ddt_log_entry_t *ddle = + avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where); + if (ddle == NULL) { + ddle = ddt_log_alloc_entry(ddt); + ddle->ddle_key = ddlwe->ddlwe_key; + avl_insert(&ddl->ddl_tree, ddle, where); + } + ddle->ddle_type = ddlwe->ddlwe_type; + ddle->ddle_class = ddlwe->ddlwe_class; + memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); +} + +void +ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + + ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + /* Get our block */ + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block]; + + /* + * If this would take us past the end of the block, finish it and + * move to the next one. 
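+ * (As a sketch: each block then holds floor(db_size / dlu_reclen) records; the unused tail stays zeroed, which reads back as DLR_INVALID at load time and is skipped.)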
+ */ + if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) { + ASSERT3U(dlu->dlu_offset, >, 0); + dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE); + dlu->dlu_block++; + dlu->dlu_offset = 0; + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + db = dlu->dlu_dbp[dlu->dlu_block]; + } + + /* + * If this is the first time touching the block, inform the DMU that + * we will fill it, and zero it out. + */ + if (dlu->dlu_offset == 0) { + dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE); + memset(db->db_data, 0, db->db_size); + } + + /* Create the log record directly in the buffer */ + ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset); + DLR_SET_TYPE(dlr, DLR_ENTRY); + DLR_SET_RECLEN(dlr, dlu->dlu_reclen); + DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type); + DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)&dlr->dlr_payload; + dlre->dlre_key = ddlwe->ddlwe_key; + memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); + + /* Advance offset for next record. */ + dlu->dlu_offset += dlu->dlu_reclen; +} + +void +ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp); + ASSERT3U(dlu->dlu_offset, >, 0); + + /* + * Close out the last block. Whatever we haven't used will be zeroed, + * which matches DLR_INVALID, so we can detect this during load. + */ + dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE); + + dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG); + + ddt->ddt_log_active->ddl_length += + dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz; + dnode_rele(dlu->dlu_dn, FTAG); + + ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx); + + memset(dlu, 0, sizeof (ddt_log_update_t)); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +boolean_t +ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) +{ + ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); + if (ddle == NULL) + return (B_FALSE); + + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +boolean_t +ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = + avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL); + if (!ddle) + ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL); + if (!ddle) + return (B_FALSE); + if (ddlwe) + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + return (B_TRUE); +} + +void +ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + ASSERT3U(ddl->ddl_object, !=, 0); + +#ifdef ZFS_DEBUG + /* + * There should not be any entries on the log tree before the given + * checkpoint. Assert that this is the case. 
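+ * (This holds because ddt_log_take_first() consumes the flushing tree strictly in key order, so everything at or below the checkpointed key has already been taken.)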
+ */ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle != NULL) + VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key), + >, 0); +#endif + + ddl->ddl_flags |= DDL_FLAG_CHECKPOINT; + ddl->ddl_checkpoint = ddlwe->ddlwe_key; + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +void +ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + if (ddl->ddl_object == 0) + return; + + ASSERT(avl_is_empty(&ddl->ddl_tree)); + + /* Eject the entire object */ + dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx); + + ddl->ddl_length = 0; + ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT; + memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t)); + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) +{ + /* Swap the logs. The old flushing one must be empty */ + VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)); + + /* + * If there are still blocks on the flushing log, truncate it first. + * This can happen if there were entries on the flushing log that were + * removed in memory via ddt_lookup(); their vestigial remains are + * on disk. + */ + if (ddt->ddt_log_flushing->ddl_length > 0) + ddt_log_truncate(ddt, tx); + + /* + * Swap policy. We swap the logs (and so begin flushing) when the + * active tree grows too large, or when we haven't swapped it in + * some amount of time, or if something has requested the logs be + * flushed ASAP (see ddt_walk_init()). + */ + + /* + * The log tree is too large if the memory usage of its entries is over + * half of the memory limit. This effectively gives each log tree half + * the available memory. + */ + const boolean_t too_large = + (avl_numnodes(&ddt->ddt_log_active->ddl_tree) * + DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1); + + const boolean_t too_old = + tx->tx_txg >= + (ddt->ddt_log_active->ddl_first_txg + + MAX(1, zfs_dedup_log_txg_max)); + + const boolean_t force = + ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg; + + if (!(too_large || too_old || force)) + return (B_FALSE); + + ddt_log_t *swap = ddt->ddt_log_active; + ddt->ddt_log_active = ddt->ddt_log_flushing; + ddt->ddt_log_flushing = swap; + + ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING); + ddt->ddt_log_active->ddl_flags &= + ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT); + + ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING)); + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; + + ddt->ddt_log_active->ddl_first_txg = tx->tx_txg; + + ddt_log_update_header(ddt, ddt->ddt_log_active, tx); + ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx); + + ddt_log_update_stats(ddt); + + return (B_TRUE); +} + +static inline void +ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr, + const ddt_key_t *checkpoint) +{ + ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)dlr->dlr_payload; + if (checkpoint != NULL && + ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) { + /* Skip pre-checkpoint entries; they're already flushed.
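+ * (Example, mine: if the flushing log recorded keys k1 < k2 < k3 and the checkpoint is k2, a reload only rehydrates k3 into the tree.)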
*/ + return; + } + + ddt_lightweight_entry_t ddlwe; + ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr); + ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr); + + ddlwe.ddlwe_key = dlre->dlre_key; + memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt)); + + ddt_log_update_entry(ddt, ddl, &ddlwe); +} + +static void +ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) +{ + void *cookie = NULL; + ddt_log_entry_t *ddle; + IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); + while ((ddle = + avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + } + ASSERT(avl_is_empty(&ddl->ddl_tree)); +} + +static int +ddt_log_load_one(ddt_t *ddt, uint_t n) +{ + ASSERT3U(n, <, 2); + + ddt_log_t *ddl = &ddt->ddt_log[n]; + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + uint64_t obj; + int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &obj); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); + + dnode_t *dn; + err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn); + if (err != 0) + return (err); + + ddt_log_header_t hdr; + dmu_buf_t *db; + err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + return (err); + } + memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t)); + dmu_buf_rele(db, FTAG); + + if (DLH_GET_VERSION(&hdr) != 1) { + dnode_rele(dn, FTAG); + zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s " + "unknown version=%llu", spa_name(ddt->ddt_spa), name, + (u_longlong_t)DLH_GET_VERSION(&hdr)); + return (SET_ERROR(EINVAL)); + } + + ddt_key_t *checkpoint = NULL; + if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) { + /* + * If the log has a checkpoint, then we can ignore any entries + * that have already been flushed. + */ + ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING); + checkpoint = &hdr.dlh_checkpoint; + } + + if (hdr.dlh_length > 0) { + dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length, + ZIO_PRIORITY_SYNC_READ); + + for (uint64_t offset = 0; offset < hdr.dlh_length; + offset += dn->dn_datablksz) { + err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, + DMU_READ_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (err); + } + + uint64_t boffset = 0; + while (boffset < db->db_size) { + ddt_log_record_t *dlr = + (ddt_log_record_t *)(db->db_data + boffset); + + /* Partially-filled block, skip the rest */ + if (DLR_GET_TYPE(dlr) == DLR_INVALID) + break; + + switch (DLR_GET_TYPE(dlr)) { + case DLR_ENTRY: + ddt_log_load_entry(ddt, ddl, dlr, + checkpoint); + break; + + default: + dmu_buf_rele(db, FTAG); + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (SET_ERROR(EINVAL)); + } + + boffset += DLR_GET_RECLEN(dlr); + } + + dmu_buf_rele(db, FTAG); + } + } + + dnode_rele(dn, FTAG); + + ddl->ddl_object = obj; + ddl->ddl_flags = DLH_GET_FLAGS(&hdr); + ddl->ddl_length = hdr.dlh_length; + ddl->ddl_first_txg = hdr.dlh_first_txg; + + if (ddl->ddl_flags & DDL_FLAG_FLUSHING) + ddt->ddt_log_flushing = ddl; + else + ddt->ddt_log_active = ddl; + + return (0); +} + +int +ddt_log_load(ddt_t *ddt) +{ + int err; + + if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) { + /* + * The DDT is going to be freed again in a moment, so there's + * no point loading the log; it'll just slow down import. 
+ */ + return (0); + } + + ASSERT0(ddt->ddt_log[0].ddl_object); + ASSERT0(ddt->ddt_log[1].ddl_object); + if (ddt->ddt_dir_object == 0) { + /* + * If we're configured but the containing dir doesn't exist + * yet, then the log object can't possibly exist either. + */ + ASSERT3U(ddt->ddt_version, !=, UINT64_MAX); + return (SET_ERROR(ENOENT)); + } + + if ((err = ddt_log_load_one(ddt, 0)) != 0) + return (err); + if ((err = ddt_log_load_one(ddt, 1)) != 0) + return (err); + + VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing); + VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING)); + VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT)); + VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING); + + /* + * We have two finalisation tasks: + * + * - rebuild the histogram. We do this at the end rather than while + * we're loading so we don't need to uncount and recount entries that + * appear multiple times in the log. + * + * - remove entries from the flushing tree that are on both trees. This + * happens when ddt_lookup() rehydrates an entry from the flushing + * tree, as ddt_log_take_key() removes the entry from the in-memory + * tree but doesn't remove it from disk. + */ + + /* + * We don't technically need a config lock here, since there shouldn't + * be pool config changes during DDT load. dva_get_dsize_sync() via + * ddt_stat_generate() is expecting it though, and it won't hurt + * anything, so we take it. + */ + spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER); + + avl_tree_t *al = &ddt->ddt_log_active->ddl_tree; + avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree; + ddt_log_entry_t *ae = avl_first(al); + ddt_log_entry_t *fe = avl_first(fl); + while (ae != NULL || fe != NULL) { + ddt_log_entry_t *ddle; + if (ae == NULL) { + /* active exhausted, take flushing */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + } else if (fe == NULL) { + /* flushing exhausted, take active */ + ddle = ae; + ae = AVL_NEXT(al, ae); + } else { + /* compare active and flushing */ + int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key); + if (c < 0) { + /* active behind, take and advance */ + ddle = ae; + ae = AVL_NEXT(al, ae); + } else if (c > 0) { + /* flushing behind, take and advance */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + } else { + /* match.
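same key in both trees: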
remove from flushing, take active */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + avl_remove(fl, ddle); + + ddle = ae; + ae = AVL_NEXT(al, ae); + } + } + + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + + spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG); + + ddt_log_update_stats(ddt); + + return (0); +} + +void +ddt_log_alloc(ddt_t *ddt) +{ + ASSERT3P(ddt->ddt_log_active, ==, NULL); + ASSERT3P(ddt->ddt_log_flushing, ==, NULL); + + avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + ddt->ddt_log_active = &ddt->ddt_log[0]; + ddt->ddt_log_flushing = &ddt->ddt_log[1]; + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; +} + +void +ddt_log_free(ddt_t *ddt) +{ + ddt_log_empty(ddt, &ddt->ddt_log[0]); + ddt_log_empty(ddt, &ddt->ddt_log[1]); + avl_destroy(&ddt->ddt_log[0].ddl_tree); + avl_destroy(&ddt->ddt_log[1].ddl_tree); +} + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW, + "Max transactions before starting to flush dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD, + "Max memory for dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD, + "Max memory for dedup logs, as % of total memory"); diff --git a/sys/contrib/openzfs/module/zfs/ddt_stats.c b/sys/contrib/openzfs/module/zfs/ddt_stats.c index 82b682019ae9..8f55bc24f0f5 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_stats.c +++ b/sys/contrib/openzfs/module/zfs/ddt_stats.c @@ -33,27 +33,32 @@ #include static void -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, + ddt_stat_t *dds) { spa_t *spa = ddt->ddt_spa; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - uint64_t lsize = DDK_GET_LSIZE(ddk); - uint64_t psize = DDK_GET_PSIZE(ddk); + uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key); + uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key); memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - if (ddp->ddp_phys_birth == 0) + if (ddt_phys_birth(ddp, v) == 0) continue; - int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? - SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + int ndvas = ddt_phys_dva_count(ddp, v, + DDK_GET_CRYPT(&ddlwe->ddlwe_key)); + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva; + + uint64_t dsize = 0; for (int d = 0; d < ndvas; d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + dsize += dva_get_dsize_sync(spa, &dvas[d]); + + uint64_t refcnt = ddt_phys_refcnt(ddp, v); dds->dds_blocks += 1; dds->dds_lsize += lsize; @@ -67,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) } } -void -ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +static void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src) { - const uint64_t *s = (const uint64_t *)src; - uint64_t *d = (uint64_t *)dst; - uint64_t *d_end = (uint64_t *)(dst + 1); + dst->dds_blocks += src->dds_blocks; + dst->dds_lsize += src->dds_lsize; + dst->dds_psize += src->dds_psize; + dst->dds_dsize += src->dds_dsize; + dst->dds_ref_blocks += src->dds_ref_blocks; + dst->dds_ref_lsize += src->dds_ref_lsize; + dst->dds_ref_psize += src->dds_ref_psize; + dst->dds_ref_dsize += src->dds_ref_dsize; +} - ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ +static void +ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src) +{ + /* This caught more during development than you might expect... */ + ASSERT3U(dst->dds_blocks, >=, src->dds_blocks); + ASSERT3U(dst->dds_lsize, >=, src->dds_lsize); + ASSERT3U(dst->dds_psize, >=, src->dds_psize); + ASSERT3U(dst->dds_dsize, >=, src->dds_dsize); + ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks); + ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize); + ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize); + ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize); - for (int i = 0; i < d_end - d; i++) - d[i] += (s[i] ^ neg) - neg; + dst->dds_blocks -= src->dds_blocks; + dst->dds_lsize -= src->dds_lsize; + dst->dds_psize -= src->dds_psize; + dst->dds_dsize -= src->dds_dsize; + dst->dds_ref_blocks -= src->dds_ref_blocks; + dst->dds_ref_lsize -= src->dds_ref_lsize; + dst->dds_ref_psize -= src->dds_ref_psize; + dst->dds_ref_dsize -= src->dds_ref_dsize; } void -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe) { ddt_stat_t dds; - ddt_histogram_t *ddh; int bucket; - ddt_stat_generate(ddt, dde, &dds); + ddt_stat_generate(ddt, ddlwe, &dds); bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT3U(bucket, >=, 0); + if (bucket < 0) + return; - ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + ddt_stat_add(&ddh->ddh_stat[bucket], &dds); +} - ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +void +ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe) +{ + ddt_stat_t dds; + int bucket; + + ddt_stat_generate(ddt, ddlwe, &dds); + + bucket = highbit64(dds.dds_ref_blocks) - 1; + if (bucket < 0) + return; + + ddt_stat_sub(&ddh->ddh_stat[bucket], &dds); } void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) { for (int h = 0; h < 64; h++) - ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]); } void -ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh) { memset(dds, 0, sizeof (*dds)); for (int h = 0; h < 64; h++) - ddt_stat_add(dds, &ddh->ddh_stat[h], 0); + ddt_stat_add(dds, &ddh->ddh_stat[h]); } boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh) { - const uint64_t *s = (const uint64_t *)ddh; - const uint64_t *s_end = (const uint64_t *)(ddh + 1); + for (int h = 0; h < 64; h++) { + 
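/* A bucket counts as empty only if every one of its counters is zero. */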
const ddt_stat_t *dds = &ddh->ddh_stat[h]; - while (s < s_end) - if (*s++ != 0) - return (B_FALSE); + if (dds->dds_blocks == 0 && + dds->dds_lsize == 0 && + dds->dds_psize == 0 && + dds->dds_dsize == 0 && + dds->dds_ref_blocks == 0 && + dds->dds_ref_lsize == 0 && + dds->dds_ref_psize == 0 && + dds->dds_ref_dsize == 0) + continue; + + return (B_FALSE); + } return (B_TRUE); } @@ -170,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) ddo_total->ddo_mspace += ddo->ddo_mspace; } } + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; } /* @@ -207,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) &ddt->ddt_histogram_cache[type][class]); } } + + ddt_histogram_add(ddh, &ddt->ddt_log_histogram); } } @@ -217,7 +276,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh_total); - ddt_histogram_stat(dds_total, ddh_total); + ddt_histogram_total(dds_total, ddh_total); kmem_free(ddh_total, sizeof (ddt_histogram_t)); } diff --git a/sys/contrib/openzfs/module/zfs/ddt_zap.c b/sys/contrib/openzfs/module/zfs/ddt_zap.c index 7ce7461a2b25..d96dc505cdea 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_zap.c +++ b/sys/contrib/openzfs/module/zfs/ddt_zap.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2023, Klara Inc. */ #include @@ -51,8 +52,13 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len) ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ - c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1, - ci->ci_level); + /* Call compress function directly to avoid hole detection. 
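The DDT ZAP manages its own version byte and output sizing, so the raw compress vector is applied via stack abds here.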
*/ + abd_t sabd, dabd; + abd_get_from_buf_struct(&sabd, (void *)src, s_len); + abd_get_from_buf_struct(&dabd, dst, d_len); + c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level); + abd_free(&dabd); + abd_free(&sabd); if (c_len == s_len) { cpfunc = ZIO_COMPRESS_OFF; @@ -71,12 +77,18 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) { uchar_t version = *src++; int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - if (ci->ci_decompress != NULL) - (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); - else + if (zio_compress_table[cpfunc].ci_decompress == NULL) { memcpy(dst, src, d_len); + return; + } + + abd_t sabd, dabd; + abd_get_from_buf_struct(&sabd, src, s_len); + abd_get_from_buf_struct(&dabd, dst, d_len); + VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL)); + abd_free(&dabd); + abd_free(&sabd); if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) != (ZFS_HOST_BYTEORDER != 0)) @@ -108,7 +120,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) static int ddt_zap_lookup(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) + const ddt_key_t *ddk, void *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; @@ -155,7 +167,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object) static int ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, - const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) + const void *phys, size_t psize, dmu_tx_t *tx) { const size_t cbuf_size = psize + 1; @@ -181,7 +193,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, static int ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, - ddt_phys_t *phys, size_t psize) + void *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t za; diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 3dcf49ceb64e..b3eda8ea5097 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -95,6 +95,12 @@ uint_t dmu_prefetch_max = 8 * 1024 * 1024; uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; #endif +/* + * Override copies= for dedup state objects. 0 means the traditional behaviour + * (ie the default for the containing objset ie 3 for the MOS). + */ +uint_t dmu_ddt_copies = 0; + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, @@ -2272,6 +2278,28 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) case ZFS_REDUNDANT_METADATA_NONE: break; } + + if (dmu_ddt_copies > 0) { + /* + * If this tuneable is set, and this is a write for a + * dedup entry store (zap or log), then we treat it + * something like ZFS_REDUNDANT_METADATA_MOST on a + * regular dataset: this many copies, and one more for + * "higher" indirect blocks. This specific exception is + * necessary because dedup objects are stored in the + * MOS, which always has the highest possible copies. + */ + dmu_object_type_t stype = + dn ? 
dn->dn_storage_type : DMU_OT_NONE; + if (stype == DMU_OT_NONE) + stype = type; + if (stype == DMU_OT_DDT_ZAP) { + copies = dmu_ddt_copies; + if (level >= + zfs_redundant_metadata_most_ditto_level) + copies++; + } + } } else if (wp & WP_NOFILL) { ASSERT(level == 0); @@ -2824,3 +2852,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, /* CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, "Limit one prefetch call to this size"); + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW, + "Override copies= for dedup objects"); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index 0119191d7920..a1752650f3ba 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -1391,7 +1391,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, abd_t *dabd = abd_alloc_linear( drrw->drr_logical_size, B_FALSE); err = zio_decompress_data(drrw->drr_compressiontype, - abd, abd_to_buf(dabd), abd_get_size(abd), + abd, dabd, abd_get_size(abd), abd_get_size(dabd), NULL); if (err != 0) { @@ -1407,9 +1407,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, /* Recompress the data */ abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); - void *buf = abd_to_buf(cabd); uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), - abd, &buf, abd_get_size(abd), + abd, &cabd, abd_get_size(abd), rwa->os->os_complevel); abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); /* Swap in newly compressed data into the abd */ @@ -2221,7 +2220,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) err = zio_decompress_data( drrw->drr_compressiontype, - abd, abd_to_buf(decomp_abd), + abd, decomp_abd, abd_get_size(abd), abd_get_size(decomp_abd), NULL); diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index 45d8a290d67d..042725b235d0 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -2425,8 +2425,14 @@ get_receive_resume_token_impl(dsl_dataset_t *ds) fnvlist_free(token_nv); compressed = kmem_alloc(packed_size, KM_SLEEP); - compressed_size = gzip_compress(packed, compressed, + /* Call compress function directly to avoid hole detection. 
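The token records the exact compressed size plus a checksum over those bytes, so the packed nvlist is gzipped as-is.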
*/ + abd_t pabd, cabd; + abd_get_from_buf_struct(&pabd, packed, packed_size); + abd_get_from_buf_struct(&cabd, compressed, packed_size); + compressed_size = zfs_gzip_compress(&pabd, &cabd, packed_size, packed_size, 6); + abd_free(&cabd); + abd_free(&pabd); zio_cksum_t cksum; fletcher_4_native_varsize(compressed, compressed_size, &cksum); diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 085cfd3c5691..9d040e146308 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) zap_cursor_fini(&zc); } + ddt_walk_init(spa, scn->scn_phys.scn_max_txg); + spa_scan_stat_init(spa); vdev_scan_stat_init(spa->spa_root_vdev); @@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); + ddt_walk_init(spa, scn->scn_phys.scn_max_txg); + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_history_log_internal(spa, "scan setup", tx, @@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || - (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { + (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) || + !ddt_walk_ready(scn->scn_dp->dp_spa)) { if (zb && zb->zb_level == ZB_ROOT_LEVEL) { dprintf("suspending at first available bookmark " "%llx/%llx/%llx/%llx\n", @@ -2929,11 +2934,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx) + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { (void) tx; - const ddt_key_t *ddk = &dde->dde_key; - ddt_phys_t *ddp = dde->dde_phys; + const ddt_key_t *ddk = &ddlwe->ddlwe_key; blkptr_t bp; zbookmark_phys_t zb = { 0 }; @@ -2954,11 +2958,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v); + + if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg) continue; - ddt_bp_create(checksum, ddk, ddp, &bp); + ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); @@ -3002,11 +3008,11 @@ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; - ddt_entry_t dde = {{{{0}}}}; + ddt_lightweight_entry_t ddlwe = {0}; int error; uint64_t n = 0; - while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) @@ -3021,16 +3027,28 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); - dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) break; } - zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; " - "suspending=%u", (longlong_t)n, 
scn->scn_dp->dp_spa->spa_name, - (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); + if (error == EAGAIN) { + dsl_scan_check_suspend(scn, NULL); + error = 0; + + zfs_dbgmsg("waiting for ddt to become ready for scan " + "on %s with class_max = %u; suspending=%u", + scn->scn_dp->dp_spa->spa_name, + (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_suspending); + } else + zfs_dbgmsg("scanned %llu ddt entries on %s with " + "class_max = %u; suspending=%u", (longlong_t)n, + scn->scn_dp->dp_spa->spa_name, + (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_suspending); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || diff --git a/sys/contrib/openzfs/module/zfs/gzip.c b/sys/contrib/openzfs/module/zfs/gzip.c index f3b19446352a..e7fd6f63c4be 100644 --- a/sys/contrib/openzfs/module/zfs/gzip.c +++ b/sys/contrib/openzfs/module/zfs/gzip.c @@ -47,8 +47,9 @@ typedef uLongf zlen_t; #endif -size_t -gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +static size_t +zfs_gzip_compress_buf(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { int ret; zlen_t dstlen = d_len; @@ -82,8 +83,9 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) return ((size_t)dstlen); } -int -gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +static int +zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { (void) n; zlen_t dstlen = d_len; @@ -103,3 +105,6 @@ gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) return (0); } + +ZFS_COMPRESS_WRAP_DECL(zfs_gzip_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_gzip_decompress) diff --git a/sys/contrib/openzfs/module/zfs/lz4_zfs.c b/sys/contrib/openzfs/module/zfs/lz4_zfs.c index de90c45f2f07..377373bb9c08 100644 --- a/sys/contrib/openzfs/module/zfs/lz4_zfs.c +++ b/sys/contrib/openzfs/module/zfs/lz4_zfs.c @@ -53,8 +53,8 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, static void *lz4_alloc(int flags); static void lz4_free(void *ctx); -size_t -lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, +static size_t +zfs_lz4_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) n; @@ -81,8 +81,8 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, return (bufsiz + sizeof (bufsiz)); } -int -lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, +static int +zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) n; @@ -101,6 +101,9 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, d_start, bufsiz, d_len) < 0); } +ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_lz4_decompress) + /* * LZ4 API Description: * diff --git a/sys/contrib/openzfs/module/zfs/lzjb.c b/sys/contrib/openzfs/module/zfs/lzjb.c index a24f17e0fe74..2db549b1626f 100644 --- a/sys/contrib/openzfs/module/zfs/lzjb.c +++ b/sys/contrib/openzfs/module/zfs/lzjb.c @@ -45,8 +45,9 @@ #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) #define LEMPEL_SIZE 1024 -size_t -lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +static size_t +zfs_lzjb_compress_buf(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { (void) n; uchar_t *src = s_start; @@ -100,8 +101,9 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) return (dst - (uchar_t *)d_start); } -int -lzjb_decompress(void *s_start, void *d_start, size_t s_len, 
size_t d_len, int n) +static int +zfs_lzjb_decompress_buf(void *s_start, void *d_start, + size_t s_len, size_t d_len, int n) { (void) s_len, (void) n; uchar_t *src = s_start; @@ -130,3 +132,6 @@ lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) } return (0); } + +ZFS_COMPRESS_WRAP_DECL(zfs_lzjb_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_lzjb_decompress) diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 7a3dd29769ca..1a68a0953565 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -1026,16 +1026,34 @@ spa_change_guid_sync(void *arg, dmu_tx_t *tx) * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. + * + * The GUID of the pool will be changed to the value pointed to by guidp. + * The GUID may not be set to the reserved value of 0. + * The new GUID will be generated if guidp is NULL. */ int -spa_change_guid(spa_t *spa) +spa_change_guid(spa_t *spa, const uint64_t *guidp) { - int error; uint64_t guid; + int error; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); - guid = spa_generate_guid(NULL); + + if (guidp != NULL) { + guid = *guidp; + if (guid == 0) { + error = SET_ERROR(EINVAL); + goto out; + } + + if (spa_guid_exists(guid, 0)) { + error = SET_ERROR(EEXIST); + goto out; + } + } else { + guid = spa_generate_guid(NULL); + } error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); @@ -1054,6 +1072,7 @@ spa_change_guid(spa_t *spa) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } +out: mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); @@ -7588,8 +7607,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * The new device cannot have a higher alignment requirement * than the top-level vdev. */ - if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) { + return (spa_vdev_exit(spa, newrootvd, txg, + ZFS_ERR_ASHIFT_MISMATCH)); + } /* * RAIDZ-expansion-specific checks.
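(As an aside for reviewers: a minimal userland sketch of driving the explicit-GUID path through the reguid ioctl changed below. This is illustration only, not part of the patch; hdl, the pool name and the GUID value are assumed, and zcmd_write_src_nvlist() is the libzfs-internal helper that packs an nvlist into zc_nvlist_src.)

	nvlist_t *args = fnvlist_alloc();
	fnvlist_add_uint64(args, ZPOOL_REGUID_GUID, 0x1234cafeULL);

	zfs_cmd_t zc = {"\0"};
	(void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
	if (zcmd_write_src_nvlist(hdl, &zc, args) == 0 &&
	    zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) != 0) {
		/* spa_change_guid() rejects 0 (EINVAL), in-use GUIDs (EEXIST) */
		(void) fprintf(stderr, "reguid: %s\n", strerror(errno));
	}
	fnvlist_free(args);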
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index 2f43c4aa41b8..f7cecc9af8a4 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -645,7 +645,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, DATA_TYPE_INT32, zio->io_error, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, - DATA_TYPE_INT32, zio->io_flags, NULL); + DATA_TYPE_UINT64, zio->io_flags, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, DATA_TYPE_UINT32, zio->io_stage, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 3e2fb73b11ed..53366ad49781 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -1794,17 +1794,45 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_nvlist_src nvlist optionally containing ZPOOL_REGUID_GUID + * zc_nvlist_src_size size of the nvlist + */ static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { + uint64_t *guidp = NULL; + nvlist_t *props = NULL; spa_t *spa; + uint64_t guid; int error; + if (zc->zc_nvlist_src_size != 0) { + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props); + if (error != 0) + return (error); + + error = nvlist_lookup_uint64(props, ZPOOL_REGUID_GUID, &guid); + if (error == 0) + guidp = &guid; + else if (error == ENOENT) + guidp = NULL; + else + goto out; + } + error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { - error = spa_change_guid(spa); + error = spa_change_guid(spa, guidp); spa_close(spa, FTAG); } + +out: + if (props != NULL) + nvlist_free(props); + return (error); } @@ -4314,6 +4342,51 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } +#define DDT_PRUNE_UNIT "ddt_prune_unit" +#define DDT_PRUNE_AMOUNT "ddt_prune_amount" + +/* + * innvl: { + * "ddt_prune_unit" -> uint32_t + * "ddt_prune_amount" -> uint64_t + * } + * + * outnvl: "waited" -> boolean_t + */ +static const zfs_ioc_key_t zfs_keys_ddt_prune[] = { + {DDT_PRUNE_UNIT, DATA_TYPE_INT32, 0}, + {DDT_PRUNE_AMOUNT, DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t unit; + uint64_t amount; + + if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 || + nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) { + return (EINVAL); + } + + spa_t *spa; + int error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit, + amount); + + spa_close(spa, FTAG); + + return (error); +} + /* * This ioctl waits for activity of a particular type to complete. 
If there is * no activity of that type in progress, it returns immediately, and the @@ -7402,6 +7475,11 @@ zfs_ioctl_init(void) POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props)); + zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE, + zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 6d08d4bd1633..e4ccd144f091 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -299,10 +299,13 @@ zio_fini(void) * ========================================================================== */ -#ifdef ZFS_DEBUG -static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; +#if defined(ZFS_DEBUG) && defined(_KERNEL) +#define ZFS_ZIO_BUF_CANARY 1 #endif +#ifdef ZFS_ZIO_BUF_CANARY +static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; + /* * Use empty space after the buffer to detect overflows. * @@ -314,7 +317,6 @@ static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; static void zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { -#ifdef ZFS_DEBUG size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; @@ -323,13 +325,11 @@ zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) asize = (c + 2) << SPA_MINBLOCKSHIFT; for (; off < asize; canary++, off += sizeof (ulong_t)) *canary = zio_buf_canary; -#endif } static void zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { -#ifdef ZFS_DEBUG size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; @@ -343,8 +343,8 @@ zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) *canary, zio_buf_canary); } } -#endif } +#endif /* * Use zio_buf_alloc to allocate ZFS metadata. 
This data will appear in a @@ -363,7 +363,9 @@ zio_buf_alloc(size_t size) #endif void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); +#ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_buf_cache, c); +#endif return (p); } @@ -381,7 +383,9 @@ zio_data_buf_alloc(size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); +#ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_data_buf_cache, c); +#endif return (p); } @@ -395,7 +399,9 @@ zio_buf_free(void *buf, size_t size) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif +#ifdef ZFS_ZIO_BUF_CANARY zio_buf_check_canary(buf, size, zio_buf_cache, c); +#endif kmem_cache_free(zio_buf_cache[c], buf); } @@ -406,7 +412,9 @@ zio_data_buf_free(void *buf, size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#ifdef ZFS_ZIO_BUF_CANARY zio_buf_check_canary(buf, size, zio_data_buf_cache, c); +#endif kmem_cache_free(zio_data_buf_cache[c], buf); } @@ -479,11 +487,9 @@ static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { - void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_abd, tmp, zio->io_size, size, + zio->io_abd, data, zio->io_size, size, &zio->io_prop.zp_complevel); - abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); @@ -530,17 +536,18 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) * from the indirect block. We decompress it now and * throw away the result after we are finished. */ - tmp = zio_buf_alloc(lsize); + abd_t *abd = abd_alloc_linear(lsize, B_TRUE); ret = zio_decompress_data(BP_GET_COMPRESS(bp), - zio->io_abd, tmp, zio->io_size, lsize, + zio->io_abd, abd, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { + abd_free(abd); ret = SET_ERROR(EIO); goto error; } - ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, - tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); - zio_buf_free(tmp, lsize); + ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, + abd, lsize, BP_SHOULD_BYTESWAP(bp), mac); + abd_free(abd); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); @@ -1858,30 +1865,32 @@ zio_write_compress(zio_t *zio) /* If it's a compressed write that is not raw, compress the buffer. 
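The compressed result now lives in an abd (cabd below), so it can be pushed through zio_push_transform() or borrowed briefly when building an embedded BP.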
*/ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { - void *cbuf = NULL; + abd_t *cabd = NULL; if (abd_cmp_zero(zio->io_abd, lsize) == 0) psize = 0; else if (compress == ZIO_COMPRESS_EMPTY) psize = lsize; else - psize = zio_compress_data(compress, zio->io_abd, &cbuf, + psize = zio_compress_data(compress, zio->io_abd, &cabd, lsize, zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; - if (cbuf != NULL) - zio_buf_free(cbuf, lsize); + if (cabd != NULL) + abd_free(cabd); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { + void *cbuf = abd_borrow_buf_copy(cabd, lsize); encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); - zio_buf_free(cbuf, lsize); + abd_return_buf(cabd, cbuf, lsize); + abd_free(cabd); BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, @@ -1900,14 +1909,12 @@ zio_write_compress(zio_t *zio) psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + abd_free(cabd); psize = lsize; } else { - abd_t *cdata = abd_get_from_buf(cbuf, lsize); - abd_take_ownership_of_buf(cdata, B_TRUE); - abd_zero_off(cdata, psize, rounded - psize); + abd_zero_off(cabd, psize, rounded - psize); psize = rounded; - zio_push_transform(zio, cdata, + zio_push_transform(zio, cabd, psize, lsize, NULL); } } @@ -3254,17 +3261,21 @@ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; + ddt_t *ddt; ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); - ddp = ddt_phys_select(dde, bp); - if (zio->io_error == 0) - ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + ddt = ddt_select(zio->io_spa, bp); - if (zio->io_error == 0 && dde->dde_repair_abd == NULL) - dde->dde_repair_abd = zio->io_abd; + if (zio->io_error == 0) { + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + /* this phys variant doesn't need repair */ + ddt_phys_clear(dde->dde_phys, v); + } + + if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL) + dde->dde_io->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); @@ -3282,21 +3293,25 @@ zio_ddt_read_start(zio_t *zio) if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); + ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp); + ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; - if (ddp_self == NULL) + if (v_self == DDT_PHYS_NONE) return (zio); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + /* issue I/O for the other copies */ + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0 || v == v_self) continue; - ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, - &blk); + + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, + ddp, v, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, 
zio_ddt_child_read_done, dde, @@ -3338,8 +3353,8 @@ zio_ddt_read_done(zio_t *zio) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } - if (dde->dde_repair_abd != NULL) { - abd_copy(zio->io_abd, dde->dde_repair_abd, + if (dde->dde_io->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } @@ -3372,28 +3387,36 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) * loaded). */ - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - zio_t *lio = dde->dde_lead_zio[p]; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + if (DDT_PHYS_IS_DITTO(ddt, p)) + continue; - if (lio != NULL && do_raw) { + if (dde->dde_io == NULL) + continue; + + zio_t *lio = dde->dde_io->dde_lead_zio[p]; + if (lio == NULL) + continue; + + if (do_raw) return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); - } else if (lio != NULL) { - return (lio->io_orig_size != zio->io_orig_size || - abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); - } + + return (lio->io_orig_size != zio->io_orig_size || + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v); - if (ddp->ddp_phys_birth != 0 && do_raw) { + if (phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) @@ -3416,13 +3439,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) abd_free(tmpabd); ddt_enter(ddt); return (error != 0); - } else if (ddp->ddp_phys_birth != 0) { + } else if (phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); @@ -3450,50 +3473,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) } static void -zio_ddt_child_write_ready(zio_t *zio) +zio_ddt_child_write_done(zio_t *zio) { - int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - zio_t *pio; - if (zio->io_error) - return; + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); + + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; ddt_enter(ddt); - ASSERT(dde->dde_lead_zio[p] == zio); + /* we're the lead, so once we're done there's no one else outstanding */ + if (dde->dde_io->dde_lead_zio[p] == zio) + dde->dde_io->dde_lead_zio[p] = NULL; - ddt_phys_fill(ddp, zio->io_bp); + ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(zio, &zl)) != NULL) - ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + if (zio->io_error != 0) { + /* + * The write failed, so we're about to abort the entire IO + * chain. We need to revert the entry back to what it was at + * the last time it was successfully extended. 
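That snapshot is dde_orig_phys, saved by zio_ddt_write() when this lead zio was issued.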
+ */ + ddt_phys_copy(ddp, orig, v); + ddt_phys_clear(orig, v); + + ddt_exit(ddt); + return; + } + + /* + * We've successfully added new DVAs to the entry. Clear the saved + * state or, if there's still outstanding IO, remember it so we can + * revert to a known good state if that IO fails. + */ + if (dde->dde_io->dde_lead_zio[p] == NULL) + ddt_phys_clear(orig, v); + else + ddt_phys_copy(orig, ddp, v); + + /* + * Add references for all dedup writes that were waiting on the + * physical one, skipping any other physical writes that are waiting. + */ + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_phys_addref(ddp, v); + } ddt_exit(ddt); } static void -zio_ddt_child_write_done(zio_t *zio) +zio_ddt_child_write_ready(zio_t *zio) { - int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; + + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); + + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (zio->io_error != 0) + return; ddt_enter(ddt); - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_lead_zio[p] == zio); - dde->dde_lead_zio[p] = NULL; + ddt_phys_extend(dde->dde_phys, v, zio->io_bp); - if (zio->io_error == 0) { - zio_link_t *zl = NULL; - while (zio_walk_parents(zio, &zl) != NULL) - ddt_phys_addref(ddp); - } else { - ddt_phys_clear(ddp); + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg); } ddt_exit(ddt); @@ -3506,11 +3566,8 @@ zio_ddt_write(zio_t *zio) blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; - int p = zp->zp_copies; - zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; - ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); @@ -3518,7 +3575,7 @@ zio_ddt_write(zio_t *zio) ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; @@ -3528,7 +3585,6 @@ zio_ddt_write(zio_t *zio) ddt_exit(ddt); return (zio); } - ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* @@ -3553,29 +3609,227 @@ zio_ddt_write(zio_t *zio) return (zio); } - if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { - if (ddp->ddp_phys_birth != 0) - ddt_bp_fill(ddp, bp, txg); - if (dde->dde_lead_zio[p] != NULL) - zio_add_child(zio, dde->dde_lead_zio[p]); - else - ddt_phys_addref(ddp); - } else if (zio->io_bp_override) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); - ASSERT(BP_EQUAL(bp, zio->io_bp_override)); - ddt_phys_fill(ddp, bp); - ddt_phys_addref(ddp); - } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, - zio_ddt_child_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; - zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_lead_zio[p] = cio; + /* + * In the common cases, at this point we have a regular BP with no + * 
allocated DVAs, and the corresponding DDT entry for its checksum. + * Our goal is to fill the BP with enough DVAs to satisfy its copies= + * requirement. + * + * One of three things needs to happen to fulfill this: + * + * - if the DDT entry has enough DVAs to satisfy the BP, we just copy + * them out of the entry and return; + * + * - if the DDT entry has no DVAs (ie it's brand new), then we have to + * issue the write as normal so that DVAs can be allocated and the + * data land on disk. We then copy the DVAs into the DDT entry on + * return. + * + * - if the DDT entry has some DVAs, but too few, we have to issue the + * write, adjusted to allocate fewer copies. When it returns, we + * add the new DVAs to the DDT entry, and update the BP to have the + * full amount it originally requested. + * + * In all cases, if there's already a writing IO in flight, we need to + * defer the action until after the write is done. If our action is to + * write, we need to adjust our request for additional DVAs to match + * what will be in the DDT entry after it completes. In this way every + * IO can be guaranteed to receive enough DVAs simply by joining the + * end of the chain and letting the sequence play out. + */ + + /* + * Number of DVAs in the DDT entry. If the BP is encrypted we ignore + * the third one as normal. + */ + int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); + IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); + + /* Number of DVAs requested by the IO. */ + uint8_t need_dvas = zp->zp_copies; + + /* + * What we do next depends on whether or not there's IO outstanding that + * will update this entry. + */ + if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * No IO outstanding, so we only need to worry about ourselves. + */ + + /* + * Override BPs bring their own DVAs and their own problems. + */ + if (zio->io_bp_override) { + /* + * For a brand-new entry, all the work has been done + * for us, and we can just fill it out from the provided + * block and leave. + */ + if (have_dvas == 0) { + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_extend(ddp, v, bp); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * If we already have this entry, then we want to treat + * it like a regular write. To do this we just wipe + * them out and proceed like a regular write. + * + * Even if there are some DVAs in the entry, we still + * have to clear them out. We can't use them to fill + * out the dedup entry, as they are all referenced + * together by a bp already on disk, and will be freed + * as a group. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + /* + * If there are enough DVAs in the entry to service our request, + * then we can just use them as-is. + */ + if (have_dvas >= need_dvas) { + ddt_bp_fill(ddp, v, bp, txg); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * Otherwise, we have to issue IO to fill the entry up to the + * amount we need. + */ + need_dvas -= have_dvas; + } else { + /* + * There's a write in-flight. If there are already enough DVAs on + * the entry, then either there were already enough to start + * with, or the in-flight IO is between READY and DONE, and so + * has extended the entry with new DVAs. Either way, we don't + * need to do anything, we can just slot in behind it.
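("Slot in behind" means filling our BP from the entry and making the in-flight lead zio our child via zio_add_child(), so we cannot complete before it does.)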
+ */ + + if (zio->io_bp_override) { + /* + * If there's a write outstanding, then we're soon going to + * have our own copies of this block, so clear out the + * override block and treat it as a regular dedup + * write. See comment above. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + if (have_dvas >= need_dvas) { + /* + * A minor point: there might already be enough + * committed DVAs in the entry to service our request, + * but we don't know which are completed and which are + * allocated but not yet written. In this case, should + * the IO for the new DVAs fail, we will be on the end + * of the IO chain and will also receive an error, even + * though our request could have been serviced. + * + * This is an extremely rare case, as it requires the + * original block to be copied with a request for a + * larger number of DVAs, then copied again requesting + * the same (or already fulfilled) number of DVAs while + * the first request is active, and then that first + * request errors. In turn, the logic required to + * catch and handle it is complex. For now, I'm just + * not going to bother with it. + */ + + /* + * We always fill the bp here as we may have arrived + * after the in-flight write has passed READY, and so + * missed out. + */ + ddt_bp_fill(ddp, v, bp, txg); + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } + + /* + * There's not enough in the entry yet, so we need to look at + * the write in-flight and see how many DVAs it will have once + * it completes. + * + * The in-flight write has potentially had its copies request + * reduced (if we're filling out an existing entry), so we need + * to reach in and get the original write to find out what it is + * expecting. + * + * Note that the parent of the lead zio will always have the + * highest zp_copies of any zio in the chain, because ones that + * can be serviced without additional IO are always added to + * the back of the chain. + */ + zio_link_t *zl = NULL; + zio_t *pio = + zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); + ASSERT(pio); + uint8_t parent_dvas = pio->io_prop.zp_copies; + + if (parent_dvas >= need_dvas) { + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } + + /* + * Still not enough, so we will need to issue to get the + * shortfall. + */ + need_dvas -= parent_dvas; + } + /* + * We need to write. We will create a new write with the copies + * property adjusted to match the number of DVAs we need to + * grow the DDT entry by to satisfy the request. + */ + zio_prop_t czp = *zp; + czp.zp_copies = need_dvas; + zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, + zio->io_orig_size, zio->io_orig_size, &czp, + zio_ddt_child_write_ready, NULL, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); + + /* + * We are the new lead zio, because our parent has the highest + * zp_copies that has been requested for this entry so far. + */ + ddt_alloc_entry_io(dde); + if (dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * First time out, take a copy of the stable entry to revert + * to if there's an error (see zio_ddt_child_write_done()) + */ + ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v); + } else { + /* + * Make the existing chain our child, because it cannot + * complete until we have.
+ */ + zio_add_child(cio, dde->dde_io->dde_lead_zio[p]); + } + dde->dde_io->dde_lead_zio[p] = cio; + ddt_exit(ddt); zio_nowait(cio); @@ -3591,21 +3845,30 @@ zio_ddt_free(zio_t *zio) spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; + ddt_entry_t *dde = NULL; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); - freedde = dde = ddt_lookup(ddt, bp, B_TRUE); + freedde = dde = ddt_lookup(ddt, bp); if (dde) { - ddp = ddt_phys_select(dde, bp); - if (ddp) - ddt_phys_decref(ddp); + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + if (v != DDT_PHYS_NONE) + ddt_phys_decref(dde->dde_phys, v); } ddt_exit(ddt); + /* + * When no entry was found, it must have been pruned, + * so we can free it now instead of decrementing the + * refcount in the DDT. + */ + if (!dde) { + BP_SET_DEDUP(bp, 0); + zio->io_pipeline |= ZIO_STAGE_DVA_FREE; + } + return (zio); } diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c index e12d5498ccda..9182917f75eb 100644 --- a/sys/contrib/openzfs/module/zfs/zio_compress.c +++ b/sys/contrib/openzfs/module/zfs/zio_compress.c @@ -29,7 +29,7 @@ /* * Copyright (c) 2013, 2018 by Delphix. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude */ @@ -48,26 +48,42 @@ static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. + * + * NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS. + * THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE + * PART OF THE ON-DISK FORMAT. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {"inherit", 0, NULL, NULL, NULL}, - {"on", 0, NULL, NULL, NULL}, - {"uncompressed", 0, NULL, NULL, NULL}, - {"lzjb", 0, lzjb_compress, lzjb_decompress, NULL}, - {"empty", 0, NULL, NULL, NULL}, - {"gzip-1", 1, gzip_compress, gzip_decompress, NULL}, - {"gzip-2", 2, gzip_compress, gzip_decompress, NULL}, - {"gzip-3", 3, gzip_compress, gzip_decompress, NULL}, - {"gzip-4", 4, gzip_compress, gzip_decompress, NULL}, - {"gzip-5", 5, gzip_compress, gzip_decompress, NULL}, - {"gzip-6", 6, gzip_compress, gzip_decompress, NULL}, - {"gzip-7", 7, gzip_compress, gzip_decompress, NULL}, - {"gzip-8", 8, gzip_compress, gzip_decompress, NULL}, - {"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, - {"zle", 64, zle_compress, zle_decompress, NULL}, - {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, - {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap, - zfs_zstd_decompress, zfs_zstd_decompress_level}, + {"inherit", 0, NULL, NULL, NULL}, + {"on", 0, NULL, NULL, NULL}, + {"uncompressed", 0, NULL, NULL, NULL}, + {"lzjb", 0, + zfs_lzjb_compress, zfs_lzjb_decompress, NULL}, + {"empty", 0, NULL, NULL, NULL}, + {"gzip-1", 1, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-2", 2, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-3", 3, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-4", 4, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-5", 5, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-6", 6, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-7", 7, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-8", 8, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-9", 9, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"zle", 64, + zfs_zle_compress, zfs_zle_decompress, NULL}, + {"lz4", 0, + zfs_lz4_compress, 
zfs_lz4_decompress, NULL}, + {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, + zfs_zstd_compress, zfs_zstd_decompress, zfs_zstd_decompress_level}, }; uint8_t @@ -112,20 +128,16 @@ zio_compress_select(spa_t *spa, enum zio_compress child, } size_t -zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, +zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len, uint8_t level) { size_t c_len, d_len; uint8_t complevel; zio_compress_info_t *ci = &zio_compress_table[c]; - ASSERT3U(c, <, ZIO_COMPRESS_FUNCTIONS); ASSERT3U(ci->ci_compress, !=, NULL); ASSERT3U(s_len, >, 0); - /* Compress at least 12.5% */ - d_len = s_len - (s_len >> 3); - complevel = ci->ci_level; if (c == ZIO_COMPRESS_ZSTD) { @@ -142,12 +154,12 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, } if (*dst == NULL) - *dst = zio_buf_alloc(s_len); + *dst = abd_alloc_sametype(src, s_len); - /* No compression algorithms can read from ABDs directly */ - void *tmp = abd_borrow_buf_copy(src, s_len); - c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel); - abd_return_buf(src, tmp, s_len); + /* Compress at least 12.5%, but limit to the size of the dest abd. */ + d_len = MIN(s_len - (s_len >> 3), abd_get_size(*dst)); + + c_len = ci->ci_compress(src, *dst, s_len, d_len, complevel); if (c_len > d_len) return (s_len); @@ -157,26 +169,18 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, } int -zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, +zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *level) { zio_compress_info_t *ci = &zio_compress_table[c]; if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); + int err; if (ci->ci_decompress_level != NULL && level != NULL) - return (ci->ci_decompress_level(src, dst, s_len, d_len, level)); - - return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); -} - -int -zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len, uint8_t *level) -{ - void *tmp = abd_borrow_buf_copy(src, s_len); - int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level); - abd_return_buf(src, tmp, s_len); + err = ci->ci_decompress_level(src, dst, s_len, d_len, level); + else + err = ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); /* * Decompression shouldn't fail, because we've already verified @@ -185,9 +189,9 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, */ if (zio_decompress_fail_fraction != 0 && random_in_range(zio_decompress_fail_fraction) == 0) - ret = SET_ERROR(EINVAL); + err = SET_ERROR(EINVAL); - return (ret); + return (err); } int diff --git a/sys/contrib/openzfs/module/zfs/zle.c b/sys/contrib/openzfs/module/zfs/zle.c index 1483a65af803..7810161966dc 100644 --- a/sys/contrib/openzfs/module/zfs/zle.c +++ b/sys/contrib/openzfs/module/zfs/zle.c @@ -34,8 +34,9 @@ #include #include -size_t -zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +static size_t +zfs_zle_compress_buf(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { uchar_t *src = s_start; uchar_t *dst = d_start; @@ -64,8 +65,9 @@ zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) return (src == s_end ? 
dst - (uchar_t *)d_start : s_len); } -int -zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +static int +zfs_zle_decompress_buf(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { uchar_t *src = s_start; uchar_t *dst = d_start; @@ -89,3 +91,6 @@ zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) } return (dst == d_end ? 0 : -1); } + +ZFS_COMPRESS_WRAP_DECL(zfs_zle_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_zle_decompress) diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c index 76b5e2759f4f..2100383a6883 100644 --- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c +++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c @@ -56,6 +56,10 @@ static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3; static unsigned int zstd_abort_size = (128 * 1024); #endif +#ifdef IN_BASE +int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int); +#endif + static kstat_t *zstd_ksp = NULL; typedef struct zstd_stats { @@ -436,7 +440,7 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) } #ifndef IN_LIBSA -size_t +static size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, int level) { @@ -469,7 +473,7 @@ zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && s_len >= actual_abort_size) { int pass_len = 1; - pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0); + pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0); if (pass_len < d_len) { ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); goto keep_trying; @@ -495,8 +499,8 @@ zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, } /* Compress block using zstd */ -size_t -zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, +static size_t +zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len, int level) { size_t c_len; @@ -599,11 +603,74 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, return (c_len + sizeof (*hdr)); } + +static size_t +zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, + int level) +{ + int16_t zstd_level; + if (zstd_enum_to_level(level, &zstd_level)) { + ZSTDSTAT_BUMP(zstd_stat_com_inval); + return (s_len); + } + /* + * A zstd early abort heuristic. + * + * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently + * 128k), don't try any of this, just go. + * (because experimentally that was a reasonable cutoff for a perf win + * with tiny ratio change) + * - First, we try LZ4 compression, and if it doesn't early abort, we + * jump directly to whatever compression level we intended to try. + * - Second, we try zstd-1 - if that errors out (usually, but not + * exclusively, if it would overflow), we give up early. + * + * If it works, instead we go on and compress anyway. + * + * Why two passes? LZ4 alone gets you a lot of the way, but on highly + * compressible data, it was losing up to 8.5% of the compressed + * savings versus no early abort, and all the zstd-fast levels are + * worse indications on their own than LZ4, and don't improve the LZ4 + * pass noticeably if stacked like this.
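+ *
+ * (Illustrative walk-through, not part of the original comment: with the
+ * defaults above and a non-zero zstd_earlyabort_pass, a 1M record written
+ * at zstd-9 passes both gates, since 9 >= zstd_cutoff_level and
+ * 1M >= zstd_abort_size. The code below then runs zfs_lz4_compress
+ * first; if that result fits in d_len we jump straight to the requested
+ * level, otherwise we attempt a zstd-1 pass and store the block
+ * uncompressed if that also fails to fit.)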
+ */ + size_t actual_abort_size = zstd_abort_size; + if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && + s_len >= actual_abort_size) { + int pass_len = 1; + abd_t sabd, dabd; + abd_get_from_buf_struct(&sabd, s_start, s_len); + abd_get_from_buf_struct(&dabd, d_start, d_len); + pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0); + abd_free(&dabd); + abd_free(&sabd); + if (pass_len < d_len) { + ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); + goto keep_trying; + } + ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); + + pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len, + d_len, ZIO_ZSTD_LEVEL_1); + if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { + ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); + return (s_len); + } + ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); + } else { + ZSTDSTAT_BUMP(zstd_stat_passignored); + if (s_len < actual_abort_size) { + ZSTDSTAT_BUMP(zstd_stat_passignored_size); + } + } +keep_trying: + return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level)); + +} #endif /* Decompress block using zstd and return its stored level */ -int -zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, +static int +zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, uint8_t *level) { ZSTD_DCtx *dctx; @@ -678,16 +745,31 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, } /* Decompress datablock using zstd */ +#ifdef IN_BASE int -zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, - int level __maybe_unused) +zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len, + size_t d_len, int level __maybe_unused) { - return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len, + return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len, NULL)); } +#else +static int +zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len, + size_t d_len, int level __maybe_unused) +{ + + return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len, + NULL)); } +#endif #ifndef IN_LIBSA +ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress) +ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level) + /* Allocator for zstd compression context using mempool_allocator */ static void * zstd_alloc(void *opaque __maybe_unused, size_t size) @@ -704,8 +786,8 @@ zstd_alloc(void *opaque __maybe_unused, size_t size) return ((void*)z + (sizeof (struct zstd_kmem))); } -#endif +#endif /* * Allocator for zstd decompression context using mempool_allocator with * fallback to reserved memory if allocation fails diff --git a/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in b/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in index 4cc075585d4b..30524474d1ac 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in @@ -145,6 +145,24 @@ for kernel_version in %{?kernel_versions}; do %{?kernel_cc} \ %{?kernel_ld} \ %{?kernel_llvm} + + # Pre-6.10 kernel builds didn't need to copy over the source files to the + # build directory. However, we do need to do it post-6.10 due to + # these commits: + # + # b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source + # directory + # + # 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern + # rules + # + # Note that kmodtool actually copies over the source into the build + # directory, so what we're doing here is normal.
For efficiency reasons, + # though, we just use hardlinks instead of copying. + # + # See https://github.com/openzfs/zfs/issues/16439 for more info. + cp -lR ../%{module}-%{version}/module/* module/ + make %{?_smp_mflags} cd .. done diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index 326eb2a44d37..088e46ce578c 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -514,6 +514,10 @@ tags = ['functional', 'cli_root', 'zpool_offline'] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] +[tests/functional/cli_root/zpool_reguid] +tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg'] +tags = ['functional', 'cli_root', 'zpool_reguid'] + [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] @@ -672,7 +676,9 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_quota'] +tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_legacy_create', + 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', + 'dedup_legacy_fdt_mixed', 'dedup_quota'] pre = post = tags = ['functional', 'dedup'] diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am index 23848a82ffbd..a8df06c2e990 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am @@ -24,7 +24,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/badsend scripts_zfs_tests_bin_PROGRAMS += %D%/btree_test -%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) %C%_btree_test_LDADD = \ libzpool.la \ libzfs_core.la diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index 3de316a12504..96943421f84c 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special +DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index 9dcb097e2b38..bbeabc6dfb42 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -1424,6 +1424,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/deadman/deadman_zio.ksh \ functional/dedup/cleanup.ksh \ functional/dedup/setup.ksh \ + functional/dedup/dedup_fdt_create.ksh \ + functional/dedup/dedup_fdt_import.ksh \ + functional/dedup/dedup_legacy_create.ksh \ + functional/dedup/dedup_legacy_import.ksh \ + functional/dedup/dedup_legacy_fdt_upgrade.ksh \ + functional/dedup/dedup_legacy_fdt_mixed.ksh \ functional/dedup/dedup_quota.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ diff --git 
a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index e8a94ce209bc..50c1b7a9d09e 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -109,5 +109,6 @@ if is_linux || is_freebsd; then "feature@block_cloning" "feature@vdev_zaps_v2" "feature@raidz_expansion" + "feature@fast_dedup" ) fi diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh index a96a38ff178a..474f41eae8f3 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh @@ -95,6 +95,10 @@ while (( i < 16384 )); do done ((i += 1)) done + +# Force the DDT logs to disk with a scrub so they can be prefetched +log_must zpool scrub -w $TESTPOOL + log_note "Dataset generation completed." typeset -A generated diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am new file mode 100644 index 000000000000..87d46b394015 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am @@ -0,0 +1,6 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_reguid +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zpool_reguid_001_pos.ksh \ + zpool_reguid_002_neg.ksh diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh new file mode 100755 index 000000000000..3167a5097b5a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. 
$STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh new file mode 100755 index 000000000000..3d866cfd9f20 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh new file mode 100755 index 000000000000..4e18abd988cd --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright 2023 Mateusz Piotrowski +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool reguid' can change pool's GUID. +# +# STRATEGY: +# 1. Use zpool get to obtain the initial GUID of a pool. +# 2. Change pool's GUID with zpool reguid. +# 3. Verify the GUID has changed to a random GUID. +# +# 4. Change pool's GUID with zpool reguid -g. +# 5. Verify the GUID has changed to the specified GUID. +# + +# set_guid guid [expected_guid] +set_guid() { + gflag_guid="$1" + expected_guid="${2:-"$gflag_guid"}" + + initial_guid="$(zpool get -H -o value guid "$TESTPOOL")" + log_assert "Verify 'zpool reguid -g \"$gflag_guid\"' sets GUID as expected." + log_must zpool reguid -g "$gflag_guid" "$TESTPOOL" + retrieved_guid="$(zpool get -H -o value guid "$TESTPOOL")" + if [[ "$retrieved_guid" == "" ]]; then + log_fail "Unable to obtain the new GUID of pool $TESTPOOL" + fi + if [[ "$expected_guid" != "$retrieved_guid" ]]; then + log_fail "GUID set to '$retrieved_guid' instead of '$expected_guid'" + fi +} + +log_assert "Verify 'zpool reguid' picks a new random GUID for the pool." 
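+
+# (Clarifying note, not part of the original script: 'zpool get -H -o value
+# guid' prints the pool GUID as an unsigned 64-bit decimal string, which is
+# why the boundary values below are built with bc, e.g. "$(bc -e '2^64 - 1')".)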
+initial_guid="$(zpool get -H -o value guid "$TESTPOOL")" +if [[ $initial_guid == "" ]]; then + log_fail "Unable to obtain the initial GUID of pool $TESTPOOL" +fi +log_must zpool reguid "$TESTPOOL" +new_guid="$(zpool get -H -o value guid "$TESTPOOL")" +if [[ "$new_guid" == "" ]]; then + log_fail "Unable to obtain the new GUID of pool $TESTPOOL" +fi +if [[ "$initial_guid" == "$new_guid" ]]; then + log_fail "GUID change failed; GUID has not changed: $initial_guid" +fi + +for g in "$(bc -e '2^64 - 1')" 0; do + set_guid "$g" +done +# zpool-reguid(8) will strip the leading 0. +set_guid 0123 "123" +# GUID "-1" is effectively 2^64 - 1 in value. +set_guid -1 "$(bc -e '2^64 - 1')" + +log_pass "'zpool reguid' changes GUID as expected." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh new file mode 100755 index 000000000000..599041e284e2 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright 2023 Mateusz Piotrowski +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool reguid' does not accept invalid GUIDs. +# +# STRATEGY: +# 1. Call zpool reguid with an invalid GUID. +# 2. Verify that the call fails. +# 3. Verify that the pool GUID did not change. +# +# 4. Call zpool reguid with a GUID that is already in use. +# 5. Verify that the call fails. +# + +check_guid() { + invalid_guid="$1" + initial_guid="$(zpool get -H -o value guid "$TESTPOOL")" + log_assert "'zpool reguid' will not accept invalid GUID '$invalid_guid'" + if zpool reguid -g "$invalid_guid" "$TESTPOOL"; then + log_fail "'zpool reguid' accepted invalid GUID: $invalid_guid" + fi + final_guid="$(zpool get -H -o value guid "$TESTPOOL")" + if [[ "$initial_guid" != "$final_guid" ]]; then + log_fail "Invalid GUID change from '$initial_guid' to '$final_guid'" + fi +} + +log_assert "Verify 'zpool reguid' does not accept invalid GUIDs" + +for ig in "$(bc -e '2^64')" 0xA 0xa; do + check_guid "$ig" +done + +guid="42" +log_assert "Verify 'zpool reguid -g' does not accept GUID which are already in use" +log_must zpool reguid -g "$guid" "$TESTPOOL" +if zpool reguid -g "$guid" "$TESTPOOL"; then + log_fail "'zpool reguid' accepted GUID that was already in use: $invalid_guid" +fi + +log_pass "'zpool reguid' does not accept invalid GUIDs." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh new file mode 100755 index 000000000000..4f6e5805bb3a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -0,0 +1,106 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Simple test of dedup table operations (FDT) + +. $STF_SUITE/include/libtest.shlib + +log_assert "basic dedup (FDT) operations work" + +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + +function cleanup +{ + destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX +} + +log_onexit cleanup + +# create a pool with fast dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. 
this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the container object; DDT ZAPs aren't cleaned up until +# the entire logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should move back to enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; containing object destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0 + +log_pass "basic dedup (FDT) operations work" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh new file mode 100755 index 000000000000..259eaddc0843 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -0,0 +1,119 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Ensure dedup retains version after import (FDT) + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (FDT) retains version after import" + +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + +function cleanup +{ + destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX +} + +log_onexit cleanup + +# create a pool with fast dedup enabled. 
we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +# export and import the pool +zpool export $TESTPOOL +zpool import $TESTPOOL + +# feature still active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# remove the file +log_must rm -f /$TESTPOOL/file1 +log_must zpool sync + +# feature should revert to enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; containing object destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=128k count=4 +log_must zpool sync + +# feature should be active again +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "dedup (FDT) retains version after import" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh new file mode 100755 index 000000000000..e3efcf5c8b36 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Simple test of dedup table operations (legacy) + +. $STF_SUITE/include/libtest.shlib + +log_assert "basic dedup (legacy) operations work" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire +# logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +log_pass "basic dedup (legacy) operations work" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh new file mode 100755 index 000000000000..114cf0266e12 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Check that legacy and FDT dedup tables can coexist on the same pool: tables +# created before fast_dedup is enabled keep the legacy format, while tables +# created afterwards use FDT + +. $STF_SUITE/include/libtest.shlib + +log_assert "legacy and FDT dedup tables on the same pool can happily coexist" + +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + +function cleanup +{ + destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# create two datasets, enabling a different dedup algorithm on each +log_must zfs create -o dedup=skein $TESTPOOL/ds1 +log_must zfs create -o dedup=blake3 $TESTPOOL/ds2 + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-skein" +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-blake3" + +# create a file in the first dataset +log_must dd if=/dev/urandom of=/$TESTPOOL/ds1/file1 bs=128k count=4 +log_must zpool sync + +# should be four entries in the skein unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-skein-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1 + +# enable the fast_dedup feature +log_must zpool set feature@fast_dedup=enabled $TESTPOOL + +# confirm the feature is now enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# create a file in the second dataset +log_must dd if=/dev/urandom of=/$TESTPOOL/ds2/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# now also four entries in the blake3 unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-blake3-zap-unique: 4 entries'" + +# two entries in the MOS: the legacy skein DDT ZAP, and the containing dir for +# the blake3 FDT table +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1 +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | wc -l) -eq 1 + +# containing object has one ZAP inside +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | awk '{ print $NF }') +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-.*-zap- | wc -l) -eq 1 + +log_pass "legacy and FDT dedup tables on the same pool can happily coexist" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh new file mode 100755 index 000000000000..c36463134fde --- /dev/null +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -0,0 +1,129 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Check legacy dedup table continues to work after pool upgrade to fast_dedup, +# but if deleted and recreated, the new table is FDT + +. $STF_SUITE/include/libtest.shlib + +log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT" + +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + +function cleanup +{ + destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. 
this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# enable the fast_dedup feature +log_must zpool set feature@fast_dedup=enabled $TESTPOOL + +# confirm the feature is now enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# feature should still be enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire +# logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should still be enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file3 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "legacy dedup tables work after upgrade; new dedup tables created as FDT" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh new file mode 100755 index 000000000000..a7b667eaf882 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Ensure dedup retains version after import (legacy) + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (legacy) retains version after import" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# export and import the pool +zpool export $TESTPOOL +zpool import $TESTPOOL + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# remove the file +log_must rm -f /$TESTPOOL/file1 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "dedup (legacy) retains version after import" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index 5b83a1ca396f..326152b510a9 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -51,6 +51,12 @@ POOL="dedup_pool" save_tunable TXG_TIMEOUT +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. 
without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { if poolexists $POOL ; then @@ -58,6 +64,7 @@ function cleanup fi log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR log_must restore_tunable TXG_TIMEOUT + log_must restore_tunable DEDUP_LOG_TXG_MAX } @@ -206,10 +213,15 @@ function ddt_dedup_vdev_limit # # With no DDT quota in place, the above workload will produce over - # 800,000 entries by using space in the normal class. With a quota, - # it will be well below 500,000 entries. + # 800,000 entries by using space in the normal class. With a quota, it + # should be well under 500,000. However, logged entries are hard to + # account for because they can appear on both logs, and can also + # represent an eventual removal. This isn't easily visible from + # outside, and even internally can result in going slightly over quota. + # For here, we just set the entry count a little higher than what we + # expect to allow for some instability. # - log_must test $(ddt_entries) -le 500000 + log_must test $(ddt_entries) -le 600000 do_clean } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/setup.ksh index 3c0830401f81..a21238879faf 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/setup.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/dedup/setup.ksh @@ -25,7 +25,3 @@ # . $STF_SUITE/include/libtest.shlib - -DISK=${DISKS%% *} - -default_setup $DISK diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile index d45488dcb346..7fba05b962ce 100644 --- a/sys/modules/zfs/Makefile +++ b/sys/modules/zfs/Makefile @@ -217,6 +217,7 @@ SRCS+= zfeature_common.c \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ + zfs_valstr.c \ zpool_prop.c \ zprop_common.c @@ -237,6 +238,7 @@ SRCS+= abd.c \ bqueue.c \ dataset_kstats.c \ ddt.c \ + ddt_log.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ @@ -389,6 +391,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.abd.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.ddt_zap.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index 2508de5421df..2edd53c4001d 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1215,7 +1215,7 @@ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.2.99-634-FreeBSD_gd2ccc2155" +#define ZFS_META_ALIAS "zfs-2.2.99-693-FreeBSD_gb10992582" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -1224,7 +1224,7 @@ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ -#define ZFS_META_KVER_MAX "6.9" +#define ZFS_META_KVER_MAX "6.10" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" @@ -1245,7 +1245,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "634-FreeBSD_gd2ccc2155" +#define ZFS_META_RELEASE "693-FreeBSD_gb10992582" /* Define the project version. 
*/ #define ZFS_META_VERSION "2.2.99" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 23b7a5afa4ce..2113c3fc2680 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.2.99-634-gd2ccc2155" +#define ZFS_META_GITREV "zfs-2.2.99-693-gb10992582" diff --git a/usr.sbin/fstyp/Makefile b/usr.sbin/fstyp/Makefile index 20f71aab44c9..ae5bae2e4f9d 100644 --- a/usr.sbin/fstyp/Makefile +++ b/usr.sbin/fstyp/Makefile @@ -29,6 +29,7 @@ CFLAGS.zfs.c+= -DIN_BASE CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/include CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzpool/include CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include CFLAGS.zfs.c+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h CFLAGS.zfs.c+= -DHAVE_ISSETUGID