
9166 zfs storage pool checkpoint

illumos/illumos-gate@8671400134

Storage Pool Checkpoint (aka zpool checkpoint) can be thought of as a
“pool-wide snapshot” (or a variation of extreme rewind that doesn’t corrupt
your data). It remembers the entire state of the pool at the point it was
taken, and the user can later rewind back to it or discard it. Its generic
use case is an administrator who is about to perform a set of destructive
actions on ZFS as part of a critical procedure. She takes a checkpoint of
the pool before performing the actions, then rewinds back to it if one of
them fails or puts the pool into an unexpected state. Otherwise, she
discards it. With the assumption that no one else is making modifications
to ZFS, she basically wraps all these actions into a “high-level
transaction”.

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Author: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Committed by: Alexander Motin, 2018-03-28 18:12:06 +00:00
parent 6027cbd474
commit b8a528e092
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/vendor-sys/illumos/dist/; revision=331695
59 changed files with 3163 additions and 649 deletions
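
For illustration only, not part of the commit: a minimal C sketch of the
"high-level transaction" workflow described in the commit message above,
built on the libzfs_core entry points this change introduces,
lzc_pool_checkpoint() and lzc_pool_checkpoint_discard(). The pool name
"tank" and run_critical_procedure() are hypothetical placeholders, and
libzfs_core_init()/libzfs_core_fini() are assumed to be the usual library
setup and teardown calls.

#include <stdio.h>
#include <libzfs_core.h>

/* Hypothetical placeholder for the administrator's critical procedure. */
static int
run_critical_procedure(const char *pool)
{
	(void) pool;
	return (0);
}

int
main(void)
{
	const char *pool = "tank";	/* hypothetical pool name */
	int ret = 0;

	if (libzfs_core_init() != 0)
		return (1);

	/* Take the checkpoint before doing anything destructive. */
	if (lzc_pool_checkpoint(pool) != 0) {
		(void) fprintf(stderr, "cannot checkpoint '%s'\n", pool);
		libzfs_core_fini();
		return (1);
	}

	if (run_critical_procedure(pool) == 0) {
		/* Success: the checkpoint is no longer needed. */
		if (lzc_pool_checkpoint_discard(pool) != 0)
			ret = 1;
	} else {
		/*
		 * Failure: keep the checkpoint.  The rewind itself is done
		 * from the command line after an export:
		 *   zpool export tank
		 *   zpool import --rewind-to-checkpoint tank
		 */
		(void) fprintf(stderr,
		    "procedure failed; checkpoint kept for rewind of '%s'\n",
		    pool);
		ret = 1;
	}

	libzfs_core_fini();
	return (ret);
}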

File diff suppressed because it is too large.

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
/*
@ -41,6 +41,7 @@
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/spa_impl.h>
#include <sys/abd.h>
#include "zdb.h"
@ -162,7 +163,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg)
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", tab_prefix,
!BP_IS_HOLE(bp) &&
bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
@ -352,7 +353,7 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
if (claim_txg != 0)
claim = "already claimed";
else if (bp->blk_birth >= spa_first_txg(zilog->zl_spa))
else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
claim = "will claim";
else
claim = "won't claim";
@ -407,6 +408,11 @@ dump_intent_log(zilog_t *zilog)
for (i = 0; i < TX_MAX_TYPE; i++)
zil_rec_info[i].zri_count = 0;
/* see comment in zil_claim() or zil_check_log_chain() */
if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
zh->zh_claim_txg == 0)
return;
if (verbose >= 2) {
(void) printf("\n");
(void) zil_parse(zilog, print_log_block, print_log_record, NULL,

View File

@ -34,6 +34,7 @@
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <libgen.h>
#include <libintl.h>
#include <libuutil.h>
@ -65,6 +66,8 @@ static int zpool_do_add(int, char **);
static int zpool_do_remove(int, char **);
static int zpool_do_labelclear(int, char **);
static int zpool_do_checkpoint(int, char **);
static int zpool_do_list(int, char **);
static int zpool_do_iostat(int, char **);
static int zpool_do_status(int, char **);
@ -117,6 +120,7 @@ typedef enum {
HELP_ATTACH,
HELP_CLEAR,
HELP_CREATE,
HELP_CHECKPOINT,
HELP_DESTROY,
HELP_DETACH,
HELP_EXPORT,
@ -164,6 +168,8 @@ static zpool_command_t command_table[] = {
{ NULL },
{ "labelclear", zpool_do_labelclear, HELP_LABELCLEAR },
{ NULL },
{ "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT },
{ NULL },
{ "list", zpool_do_list, HELP_LIST },
{ "iostat", zpool_do_iostat, HELP_IOSTAT },
{ "status", zpool_do_status, HELP_STATUS },
@ -213,6 +219,8 @@ get_usage(zpool_help_t idx)
"[-o property=value] ... \n"
"\t [-O file-system-property=value] ... \n"
"\t [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
case HELP_CHECKPOINT:
return (gettext("\tcheckpoint [--discard] <pool> ...\n"));
case HELP_DESTROY:
return (gettext("\tdestroy [-f] <pool>\n"));
case HELP_DETACH:
@ -223,14 +231,13 @@ get_usage(zpool_help_t idx)
return (gettext("\thistory [-il] [<pool>] ...\n"));
case HELP_IMPORT:
return (gettext("\timport [-d dir] [-D]\n"
"\timport [-d dir | -c cachefile] [-F [-n]] <pool | id>\n"
"\timport [-o mntopts] [-o property=value] ... \n"
"\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
"[-R root] [-F [-n]] -a\n"
"\timport [-o mntopts] [-o property=value] ... \n"
"\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
"[-R root] [-F [-n]]\n"
"\t <pool | id> [newpool]\n"));
"\t [--rewind-to-checkpoint] <pool | id> [newpool]\n"));
case HELP_IOSTAT:
return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval "
"[count]]\n"));
@ -2045,6 +2052,79 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
return (0);
}
/*
* zpool checkpoint <pool>
* checkpoint --discard <pool>
*
* -d Discard the checkpoint from a checkpointed
* --discard pool.
*
* Checkpoints the specified pool, by taking a "snapshot" of its
* current state. A pool can only have one checkpoint at a time.
*/
int
zpool_do_checkpoint(int argc, char **argv)
{
boolean_t discard;
char *pool;
zpool_handle_t *zhp;
int c, err;
struct option long_options[] = {
{"discard", no_argument, NULL, 'd'},
{0, 0, 0, 0}
};
discard = B_FALSE;
while ((c = getopt_long(argc, argv, ":d", long_options, NULL)) != -1) {
switch (c) {
case 'd':
discard = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool argument\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
pool = argv[0];
if ((zhp = zpool_open(g_zfs, pool)) == NULL) {
/* As a special case, check for use of '/' in the name */
if (strchr(pool, '/') != NULL)
(void) fprintf(stderr, gettext("'zpool checkpoint' "
"doesn't work on datasets. To save the state "
"of a dataset from a specific point in time "
"please use 'zfs snapshot'\n"));
return (1);
}
if (discard)
err = (zpool_discard_checkpoint(zhp) != 0);
else
err = (zpool_checkpoint(zhp) != 0);
zpool_close(zhp);
return (err);
}
#define CHECKPOINT_OPT 1024
/*
* zpool import [-d dir] [-D]
* import [-o mntopts] [-o prop=value] ... [-R root] [-D]
@ -2086,6 +2166,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
*
* -o Set property=value and/or temporary mount options (without '=').
*
* --rewind-to-checkpoint
* Import the pool and revert back to the checkpoint.
*
* The import command scans for pools to import, and import pools based on pool
* name and GUID. The pool can also be renamed as part of the import process.
*/
@ -2119,8 +2202,15 @@ zpool_do_import(int argc, char **argv)
importargs_t idata = { 0 };
char *endptr;
struct option long_options[] = {
{"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT},
{0, 0, 0, 0}
};
/* check options */
while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX")) != -1) {
while ((c = getopt_long(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX",
long_options, NULL)) != -1) {
switch (c) {
case 'a':
do_all = B_TRUE;
@ -2198,6 +2288,9 @@ zpool_do_import(int argc, char **argv)
case 'X':
xtreme_rewind = B_TRUE;
break;
case CHECKPOINT_OPT:
flags |= ZFS_IMPORT_CHECKPOINT;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@ -3055,6 +3148,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted,
switch (prop) {
case ZPOOL_PROP_EXPANDSZ:
case ZPOOL_PROP_CHECKPOINT:
if (value == 0)
(void) strlcpy(propval, "-", sizeof (propval));
else
@ -3127,6 +3221,8 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
toplevel);
print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
scripted, toplevel);
print_one_column(ZPOOL_PROP_CHECKPOINT,
vs->vs_checkpoint_space, scripted, toplevel);
print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted,
B_TRUE);
print_one_column(ZPOOL_PROP_FRAGMENTATION,
@ -3241,8 +3337,8 @@ zpool_do_list(int argc, char **argv)
int ret;
list_cbdata_t cb = { 0 };
static char default_props[] =
"name,size,allocated,free,expandsize,fragmentation,capacity,"
"dedupratio,health,altroot";
"name,size,allocated,free,checkpoint,expandsize,fragmentation,"
"capacity,dedupratio,health,altroot";
char *props = default_props;
unsigned long interval = 0, count = 0;
zpool_list_t *list;
@ -3962,6 +4058,32 @@ typedef struct scrub_cbdata {
pool_scrub_cmd_t cb_scrub_cmd;
} scrub_cbdata_t;
static boolean_t
zpool_has_checkpoint(zpool_handle_t *zhp)
{
nvlist_t *config, *nvroot;
config = zpool_get_config(zhp, NULL);
if (config != NULL) {
pool_checkpoint_stat_t *pcs = NULL;
uint_t c;
nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
if (pcs == NULL || pcs->pcs_state == CS_NONE)
return (B_FALSE);
assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS ||
pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
return (B_TRUE);
}
return (B_FALSE);
}
int
scrub_callback(zpool_handle_t *zhp, void *data)
{
@ -3979,6 +4101,13 @@ scrub_callback(zpool_handle_t *zhp, void *data)
err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
if (err == 0 && zpool_has_checkpoint(zhp) &&
cb->cb_type == POOL_SCAN_SCRUB) {
(void) printf(gettext("warning: will not scrub state that "
"belongs to the checkpoint of pool '%s'\n"),
zpool_get_name(zhp));
}
return (err != 0);
}
@ -4171,6 +4300,40 @@ print_scan_status(pool_scan_stat_t *ps)
}
}
/*
* As we don't scrub checkpointed blocks, we want to warn the
* user that we skipped scanning some blocks if a checkpoint exists
* or existed at any time during the scan.
*/
static void
print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
{
if (ps == NULL || pcs == NULL)
return;
if (pcs->pcs_state == CS_NONE ||
pcs->pcs_state == CS_CHECKPOINT_DISCARDING)
return;
assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS);
if (ps->pss_state == DSS_NONE)
return;
if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) &&
ps->pss_end_time < pcs->pcs_start_time)
return;
if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) {
(void) printf(gettext(" scan warning: skipped blocks "
"that are only referenced by the checkpoint.\n"));
} else {
assert(ps->pss_state == DSS_SCANNING);
(void) printf(gettext(" scan warning: skipping blocks "
"that are only referenced by the checkpoint.\n"));
}
}
/*
* Print out detailed removal status.
*/
@ -4276,6 +4439,39 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
}
}
static void
print_checkpoint_status(pool_checkpoint_stat_t *pcs)
{
time_t start;
char space_buf[7];
if (pcs == NULL || pcs->pcs_state == CS_NONE)
return;
(void) printf(gettext("checkpoint: "));
start = pcs->pcs_start_time;
zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf));
if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) {
char *date = ctime(&start);
/*
* ctime() adds a newline at the end of the generated
* string, thus the weird format specifier and the
* strlen() call used to chop it off from the output.
*/
(void) printf(gettext("created %.*s, consumes %s\n"),
strlen(date) - 1, date, space_buf);
return;
}
assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
(void) printf(gettext("discarding, %s remaining.\n"),
space_buf);
}
static void
print_error_log(zpool_handle_t *zhp)
{
@ -4647,16 +4843,21 @@ status_callback(zpool_handle_t *zhp, void *data)
uint64_t nerr;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
pool_checkpoint_stat_t *pcs = NULL;
pool_scan_stat_t *ps = NULL;
pool_removal_stat_t *prs = NULL;
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
print_scan_status(ps);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
print_scan_status(ps);
print_checkpoint_scan_warning(ps, pcs);
print_removal_status(zhp, prs);
print_checkpoint_status(pcs);
namewidth = max_width(zhp, nvroot, 0, 0);
if (namewidth < 10)

View File

@ -344,6 +344,7 @@ ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_remap_blocks;
ztest_func_t ztest_spa_checkpoint_create_discard;
uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@ -386,7 +387,8 @@ ztest_info_t ztest_info[] = {
{ ztest_vdev_aux_add_remove, 1,
&ztest_opts.zo_vdevtime },
{ ztest_device_removal, 1, &zopt_sometimes },
{ ztest_remap_blocks, 1, &zopt_sometimes }
{ ztest_remap_blocks, 1, &zopt_sometimes },
{ ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@ -432,6 +434,7 @@ static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;
static kmutex_t ztest_vdev_lock;
static kmutex_t ztest_checkpoint_lock;
/*
* The ztest_name_lock protects the pool and dataset namespace used by
@ -2476,6 +2479,62 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
mutex_exit(&ztest_vdev_lock);
}
static void
ztest_spa_checkpoint(spa_t *spa)
{
ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
int error = spa_checkpoint(spa->spa_name);
switch (error) {
case 0:
case ZFS_ERR_DEVRM_IN_PROGRESS:
case ZFS_ERR_DISCARDING_CHECKPOINT:
case ZFS_ERR_CHECKPOINT_EXISTS:
break;
case ENOSPC:
ztest_record_enospc(FTAG);
break;
default:
fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error);
}
}
static void
ztest_spa_discard_checkpoint(spa_t *spa)
{
ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
int error = spa_checkpoint_discard(spa->spa_name);
switch (error) {
case 0:
case ZFS_ERR_DISCARDING_CHECKPOINT:
case ZFS_ERR_NO_CHECKPOINT:
break;
default:
fatal(0, "spa_discard_checkpoint(%s) = %d",
spa->spa_name, error);
}
}
/* ARGSUSED */
void
ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
mutex_enter(&ztest_checkpoint_lock);
if (ztest_random(2) == 0) {
ztest_spa_checkpoint(spa);
} else {
ztest_spa_discard_checkpoint(spa);
}
mutex_exit(&ztest_checkpoint_lock);
}
static vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
@ -2556,8 +2615,15 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
error = spa_vdev_remove(spa, guid, B_FALSE);
rw_exit(&ztest_name_lock);
if (error && error != EEXIST)
switch (error) {
case 0:
case EEXIST:
case ZFS_ERR_CHECKPOINT_EXISTS:
case ZFS_ERR_DISCARDING_CHECKPOINT:
break;
default:
fatal(0, "spa_vdev_remove() = %d", error);
}
} else {
spa_config_exit(spa, SCL_VDEV, FTAG);
@ -2572,10 +2638,15 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
error = spa_vdev_add(spa, nvroot);
nvlist_free(nvroot);
if (error == ENOSPC)
switch (error) {
case 0:
break;
case ENOSPC:
ztest_record_enospc("spa_vdev_add");
else if (error != 0)
break;
default:
fatal(0, "spa_vdev_add() = %d", error);
}
}
mutex_exit(&ztest_vdev_lock);
@ -2644,8 +2715,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
(ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
error = spa_vdev_add(spa, nvroot);
if (error != 0)
switch (error) {
case 0:
break;
default:
fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
}
nvlist_free(nvroot);
} else {
/*
@ -2657,8 +2733,16 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
(void) vdev_online(spa, guid, 0, NULL);
error = spa_vdev_remove(spa, guid, B_FALSE);
if (error != 0 && error != EBUSY)
switch (error) {
case 0:
case EBUSY:
case ZFS_ERR_CHECKPOINT_EXISTS:
case ZFS_ERR_DISCARDING_CHECKPOINT:
break;
default:
fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
}
}
mutex_exit(&ztest_vdev_lock);
@ -2757,7 +2841,6 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id)
--zs->zs_mirrors;
}
mutex_exit(&ztest_vdev_lock);
}
/*
@ -2856,7 +2939,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
spa_config_exit(spa, SCL_ALL, FTAG);
error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
if (error != 0 && error != ENODEV && error != EBUSY &&
error != ENOTSUP)
error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS &&
error != ZFS_ERR_DISCARDING_CHECKPOINT)
fatal(0, "detach (%s) returned %d", oldpath, error);
mutex_exit(&ztest_vdev_lock);
return;
@ -2948,6 +3032,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (error == EOVERFLOW || error == EBUSY)
expected_error = error;
if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
error == ZFS_ERR_DISCARDING_CHECKPOINT)
expected_error = error;
/* XXX workaround 6690467 */
if (error != expected_error && expected_error != EBUSY) {
fatal(0, "attach (%s %llu, %s %llu, %d) "
@ -3104,6 +3192,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
uint64_t top;
uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
mutex_enter(&ztest_checkpoint_lock);
mutex_enter(&ztest_vdev_lock);
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
@ -3114,8 +3203,9 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
* when the device removal completes).
*/
if (spa->spa_vdev_removal != NULL) {
spa_config_exit(spa, SCL_STATE, FTAG);
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
return;
}
@ -3145,6 +3235,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
return;
}
ASSERT(psize > 0);
@ -3170,6 +3261,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
}
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
return;
}
@ -3204,6 +3296,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
}
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
return;
}
@ -3234,6 +3327,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
}
/*
@ -5039,7 +5133,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
*/
fd = open(pathrand, O_RDWR);
if (fd == -1) /* we hit a gap in the device namespace */
if (fd == -1) /* we hit a gap in the device namespace */
return;
fsize = lseek(fd, 0, SEEK_END);
@ -5748,6 +5842,7 @@ ztest_run(ztest_shared_t *zs)
/*
* Initialize parent/child shared state.
*/
mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL);
mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL);
rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
@ -5809,7 +5904,7 @@ ztest_run(ztest_shared_t *zs)
NULL) == 0);
/*
* Verify that we can safely inquire about about any object,
* Verify that we can safely inquire about any object,
* whether it's allocated or not. To make it interesting,
* we probe a 5-wide window around each power of two.
* This hits all edge cases, including zero and the max.
@ -5912,6 +6007,7 @@ ztest_run(ztest_shared_t *zs)
rw_destroy(&ztest_name_lock);
mutex_destroy(&ztest_vdev_lock);
mutex_destroy(&ztest_checkpoint_lock);
}
static void
@ -6056,6 +6152,7 @@ ztest_init(ztest_shared_t *zs)
nvlist_t *nvroot, *props;
mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL);
mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL);
rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
kernel_init(FREAD | FWRITE);
@ -6095,6 +6192,7 @@ ztest_init(ztest_shared_t *zs)
rw_destroy(&ztest_name_lock);
mutex_destroy(&ztest_vdev_lock);
mutex_destroy(&ztest_checkpoint_lock);
}
static void

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
@ -225,6 +225,11 @@ zpool_feature_init(void)
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
NULL);
zfeature_register(SPA_FEATURE_POOL_CHECKPOINT,
"com.delphix:zpool_checkpoint", "zpool_checkpoint",
"Pool state can be checkpointed, allowing rewind later.",
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
static const spa_feature_t large_blocks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
@ -58,6 +58,7 @@ typedef enum spa_feature {
SPA_FEATURE_EDONR,
SPA_FEATURE_DEVICE_REMOVAL,
SPA_FEATURE_OBSOLETE_COUNTS,
SPA_FEATURE_POOL_CHECKPOINT,
SPA_FEATURES
} spa_feature_t;

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -82,6 +82,8 @@ zpool_prop_init(void)
ZFS_TYPE_POOL, "<size>", "FREE");
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "FREEING");
zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT");
zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "LEAKED");
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,

View File

@ -131,6 +131,11 @@ typedef enum zfs_error {
EZFS_POOLREADONLY, /* pool is in read-only mode */
EZFS_SCRUB_PAUSED, /* scrub currently paused */
EZFS_NO_PENDING, /* cannot cancel, no operation is pending */
EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */
EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */
EZFS_NO_CHECKPOINT, /* pool has no checkpoint */
EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
EZFS_VDEV_TOO_BIG, /* a device is too big to be used */
EZFS_UNKNOWN
} zfs_error_t;
@ -417,6 +422,8 @@ extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
nvlist_t *);
extern int zpool_checkpoint(zpool_handle_t *);
extern int zpool_discard_checkpoint(zpool_handle_t *);
/*
* Basic handle manipulations. These functions do not create or destroy the

View File

@ -317,6 +317,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
break;
case ZPOOL_PROP_BOOTSIZE:
case ZPOOL_PROP_EXPANDSZ:
case ZPOOL_PROP_CHECKPOINT:
if (intval == 0) {
(void) strlcpy(buf, "-", len);
} else if (literal) {
@ -1280,6 +1281,48 @@ zpool_destroy(zpool_handle_t *zhp, const char *log_str)
return (0);
}
/*
* Create a checkpoint in the given pool.
*/
int
zpool_checkpoint(zpool_handle_t *zhp)
{
libzfs_handle_t *hdl = zhp->zpool_hdl;
char msg[1024];
int error;
error = lzc_pool_checkpoint(zhp->zpool_name);
if (error != 0) {
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot checkpoint '%s'"), zhp->zpool_name);
(void) zpool_standard_error(hdl, error, msg);
return (-1);
}
return (0);
}
/*
* Discard the checkpoint from the given pool.
*/
int
zpool_discard_checkpoint(zpool_handle_t *zhp)
{
libzfs_handle_t *hdl = zhp->zpool_hdl;
char msg[1024];
int error;
error = lzc_pool_checkpoint_discard(zhp->zpool_name);
if (error != 0) {
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot discard checkpoint in '%s'"), zhp->zpool_name);
(void) zpool_standard_error(hdl, error, msg);
return (-1);
}
return (0);
}
/*
* Add the given vdevs to the pool. The caller must have already performed the
* necessary verification to ensure that the vdev specification is well-formed.

View File

@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2017 Datto Inc.
*/
@ -238,6 +238,17 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_NO_PENDING:
return (dgettext(TEXT_DOMAIN, "operation is not "
"in progress"));
case EZFS_CHECKPOINT_EXISTS:
return (dgettext(TEXT_DOMAIN, "checkpoint exists"));
case EZFS_DISCARDING_CHECKPOINT:
return (dgettext(TEXT_DOMAIN, "currently discarding "
"checkpoint"));
case EZFS_NO_CHECKPOINT:
return (dgettext(TEXT_DOMAIN, "checkpoint does not exist"));
case EZFS_DEVRM_IN_PROGRESS:
return (dgettext(TEXT_DOMAIN, "device removal in progress"));
case EZFS_VDEV_TOO_BIG:
return (dgettext(TEXT_DOMAIN, "device exceeds supported size"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
@ -487,7 +498,21 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ENOTACTIVE:
zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap);
break;
case ZFS_ERR_CHECKPOINT_EXISTS:
zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap);
break;
case ZFS_ERR_DISCARDING_CHECKPOINT:
zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap);
break;
case ZFS_ERR_NO_CHECKPOINT:
zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap);
break;
case ZFS_ERR_DEVRM_IN_PROGRESS:
zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap);
break;
case ZFS_ERR_VDEV_TOO_BIG:
zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap);
break;
default:
zfs_error_aux(hdl, strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);

View File

@ -951,6 +951,74 @@ lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
memlimit, argnvl, outnvl));
}
/*
* Creates a checkpoint for the specified pool.
*
* If this function returns 0 the pool was successfully checkpointed.
*
* This method may also return:
*
* ZFS_ERR_CHECKPOINT_EXISTS
* The pool already has a checkpoint. A pool can have at most one
* checkpoint at any given time.
*
* ZFS_ERR_DISCARDING_CHECKPOINT
* ZFS is in the middle of discarding a checkpoint for this pool.
* The pool can be checkpointed again once the discard is done.
*
* ZFS_ERR_DEVRM_IN_PROGRESS
* A vdev is currently being removed. The pool cannot be
* checkpointed until the device removal is done.
*
* ZFS_ERR_VDEV_TOO_BIG
* One or more top-level vdevs exceed the maximum vdev size
* supported for this feature.
*/
int
lzc_pool_checkpoint(const char *pool)
{
int error;
nvlist_t *result = NULL;
nvlist_t *args = fnvlist_alloc();
error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result);
fnvlist_free(args);
fnvlist_free(result);
return (error);
}
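
For illustration only, not part of this diff: a hedged caller-side sketch of
how the return values documented above might be classified, in the same
spirit as ztest_spa_checkpoint() elsewhere in this change. It assumes
libzfs_core_init() has already been called and that the ZFS_ERR_* values
live in sys/fs/zfs.h, where the rest of this change uses them;
checkpoint_with_report() is a hypothetical name.

#include <stdio.h>
#include <string.h>
#include <libzfs_core.h>
#include <sys/fs/zfs.h>		/* assumed home of the ZFS_ERR_* values */

/* Hypothetical helper: checkpoint a pool and report why it failed, if it did. */
static int
checkpoint_with_report(const char *pool)
{
	int error = lzc_pool_checkpoint(pool);

	switch (error) {
	case 0:
		(void) printf("checkpointed '%s'\n", pool);
		break;
	case ZFS_ERR_CHECKPOINT_EXISTS:
		(void) printf("'%s' already has a checkpoint\n", pool);
		break;
	case ZFS_ERR_DISCARDING_CHECKPOINT:
		(void) printf("'%s' is still discarding its checkpoint; "
		    "retry later\n", pool);
		break;
	case ZFS_ERR_DEVRM_IN_PROGRESS:
		(void) printf("device removal in progress on '%s'\n", pool);
		break;
	case ZFS_ERR_VDEV_TOO_BIG:
		(void) printf("a top-level vdev of '%s' exceeds the size "
		    "supported by this feature\n", pool);
		break;
	default:
		(void) printf("cannot checkpoint '%s': %s\n", pool,
		    strerror(error));
	}
	return (error);
}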
/*
* Discard the checkpoint from the specified pool.
*
* If this function returns 0 the checkpoint was successfully discarded.
*
* This method may also return:
*
* ZFS_ERR_NO_CHECKPOINT
* The pool does not have a checkpoint.
*
* ZFS_ERR_DISCARDING_CHECKPOINT
* ZFS is already in the middle of discarding the checkpoint.
*/
int
lzc_pool_checkpoint_discard(const char *pool)
{
int error;
nvlist_t *result = NULL;
nvlist_t *args = fnvlist_alloc();
error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result);
fnvlist_free(args);
fnvlist_free(result);
return (error);
}
/*
* Executes a read-only channel program.
*

View File

@ -92,6 +92,9 @@ int lzc_channel_program(const char *, const char *, uint64_t,
int lzc_channel_program_nosync(const char *, const char *, uint64_t,
uint64_t, nvlist_t *, nvlist_t **);
int lzc_pool_checkpoint(const char *);
int lzc_pool_checkpoint_discard(const char *);
#ifdef __cplusplus
}
#endif

View File

@ -21,7 +21,7 @@
.Nd display zpool debugging and consistency information
.Sh SYNOPSIS
.Nm
.Op Fl AbcdDFGhiLMPsvX
.Op Fl AbcdDFGhikLMPsvX
.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
.Op Fl I Ar inflight I/Os
.Oo Fl o Ar var Ns = Ns Ar value Oc Ns ...
@ -170,6 +170,9 @@ Display information about intent log
.Pq ZIL
entries relating to each dataset.
If specified multiple times, display counts of each intent log transaction type.
.It Fl k
Examine the checkpointed state of the pool.
Note that the on-disk format of the pool is not reverted to the checkpointed state.
.It Fl l Ar device
Read the vdev labels from the specified device.
.Nm Fl l

View File

@ -43,6 +43,10 @@
.Op Fl f
.Ar pool device new_device
.Nm
.Cm checkpoint
.Op Fl d, -discard
.Ar pool
.Nm
.Cm clear
.Ar pool
.Op Ar device
@ -93,6 +97,7 @@
.Cm import
.Op Fl Dfm
.Op Fl F Op Fl n
.Op Fl -rewind-to-checkpoint
.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir
.Op Fl o Ar mntopts
.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
@ -451,6 +456,50 @@ configuration.
.Pp
The content of the cache devices is considered volatile, as is the case with
other system caches.
.Ss Pool checkpoint
Before starting critical procedures that include destructive actions (e.g.
.Nm zfs Cm destroy
), an administrator can checkpoint the pool's state and, in the case of a
mistake or failure, rewind the entire pool back to the checkpoint.
Otherwise, the checkpoint can be discarded when the procedure has completed
successfully.
.Pp
A pool checkpoint can be thought of as a pool-wide snapshot and should be used
with care as it contains every part of the pool's state, from properties to vdev
configuration.
Thus, while a pool has a checkpoint, certain operations are not allowed:
specifically, vdev removal/attach/detach, mirror splitting, and changing the
pool's guid.
Adding a new vdev is supported, but in the case of a rewind it will have to be
added again.
Finally, users of this feature should keep in mind that scrubs in a pool that
has a checkpoint do not repair checkpointed data.
.Pp
To create a checkpoint for a pool:
.Bd -literal
# zpool checkpoint pool
.Ed
.Pp
To later rewind to its checkpointed state, you need to first export it and
then rewind it during import:
.Bd -literal
# zpool export pool
# zpool import --rewind-to-checkpoint pool
.Ed
.Pp
To discard the checkpoint from a pool:
.Bd -literal
# zpool checkpoint -d pool
.Ed
.Pp
Dataset reservations (controlled by the
.Nm reservation
or
.Nm refreservation
zfs properties) may be unenforceable while a checkpoint exists, because the
checkpoint is allowed to consume the dataset's reservation.
Finally, data that is part of the checkpoint but has been freed in the
current state of the pool won't be scanned during a scrub.
.Ss Properties
Each pool has several properties associated with it.
Some properties are read-only statistics while others are configurable and
@ -767,6 +816,39 @@ Not all devices can be overridden in this manner.
.El
.It Xo
.Nm
.Cm checkpoint
.Op Fl d, -discard
.Ar pool
.Xc
Checkpoints the current state of
.Ar pool
, which can be later restored by
.Nm zpool Cm import --rewind-to-checkpoint .
The existence of a checkpoint in a pool prohibits the following
.Nm zpool
commands:
.Cm remove ,
.Cm attach ,
.Cm detach ,
.Cm split ,
and
.Cm reguid .
In addition, it may break reservation boundaries if the pool lacks free
space.
The
.Nm zpool Cm status
command indicates the existence of a checkpoint or the progress of discarding a
checkpoint from a pool.
The
.Nm zpool Cm list
command reports how much space the checkpoint takes from the pool.
.Bl -tag -width Ds
.It Fl d, -discard
Discards an existing checkpoint from
.Ar pool .
.El
.It Xo
.Nm
.Cm clear
.Ar pool
.Op Ar device
@ -1152,6 +1234,7 @@ property to
.Cm import
.Op Fl Dfm
.Op Fl F Op Fl n
.Op Fl -rewind-to-checkpoint
.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir
.Op Fl o Ar mntopts
.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
@ -1240,6 +1323,16 @@ and the
.Sy altroot
property to
.Ar root .
.It Fl -rewind-to-checkpoint
Rewinds the pool to the checkpointed state.
Once the pool is imported with this flag there is no way to undo the rewind.
All changes and data that were written after the checkpoint are lost!
The only exception is when the
.Sy readonly
mounting option is enabled.
In this case, the checkpointed state of the pool is opened and an
administrator can see what the pool would look like if they were to fully
rewind.
.El
.It Xo
.Nm

View File

@ -1,5 +1,5 @@
'\" te
.\" Copyright (c) 2013, 2016 by Delphix. All rights reserved.
.\" Copyright (c) 2013, 2017 by Delphix. All rights reserved.
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
.\" Copyright (c) 2014 Integros [integros.com]
@ -470,6 +470,24 @@ used on a top-level vdev, and will never return to being \fBenabled\fR.
.sp
.ne 2
.na
\fB\fBzpool_checkpoint\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.delphix:zpool_checkpoint
READ\-ONLY COMPATIBLE yes
DEPENDENCIES none
.TE
This feature enables the "zpool checkpoint" subcommand, which can
checkpoint the state of the pool at the time the subcommand is issued and
later rewind back to that state or discard it.
This feature becomes \fBactive\fR when the "zpool checkpoint" subcommand
is used to checkpoint the pool.
The feature will only return to being \fBenabled\fR when the pool
is rewound or the checkpoint has been discarded.
\fB\fBlarge_blocks\fR\fR
.ad
.RS 4n

View File

@ -1385,6 +1385,7 @@ ZFS_COMMON_OBJS += \
edonr_zfs.o \
skein_zfs.o \
spa.o \
spa_checkpoint.o \
spa_config.o \
spa_errlog.o \
spa_history.o \

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -31,6 +31,7 @@
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/sa.h>
@ -80,8 +81,8 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
if (BP_IS_HOLE(bp))
return (0);
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
return (0);
if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
return (-1);
SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
@ -120,20 +121,17 @@ static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
uint64_t claim_txg = zh->zh_claim_txg;
zilog_t *zilog;
/*
* We only want to visit blocks that have been claimed but not yet
* replayed; plus, in read-only mode, blocks that are already stable.
* replayed; plus blocks that are already stable in read-only mode.
*/
if (claim_txg == 0 && spa_writeable(td->td_spa))
return;
zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
claim_txg);
zil_free(zilog);
}

View File

@ -1083,6 +1083,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
(spa_is_root(os->os_spa) &&
spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
dn = (object == DMU_USERUSED_OBJECT) ?
DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
@ -1176,7 +1178,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
mutex_exit(&dn->dn_mtx);
zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
return ((flag & DNODE_MUST_BE_ALLOCATED) ? ENOENT : EEXIST);
}
if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);

View File

@ -608,7 +608,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_maxblkid == 0 || list_head(list) != NULL ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
dnp->dn_datablkszsec ||
range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
!range_tree_is_empty(dn->dn_free_ranges[txgoff]));
dnp->dn_datablkszsec =
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
dn->dn_next_blksz[txgoff] = 0;

View File

@ -46,6 +46,7 @@
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
@ -205,7 +206,9 @@ int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
boolean_t async)
{
int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
int used = bp_get_dsize_sync(spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
@ -3717,7 +3720,8 @@ dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
ddsqra.ddsqra_value = refquota;
return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
dsl_dataset_set_refquota_sync, &ddsqra, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
static int
@ -3832,8 +3836,8 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
ddsqra.ddsqra_value = refreservation;
return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
dsl_dataset_set_refreservation_sync, &ddsqra,
0, ZFS_SPACE_CHECK_NONE));
dsl_dataset_set_refreservation_sync, &ddsqra, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
/*

View File

@ -1022,7 +1022,7 @@ dsl_destroy_head(const char *name)
error = dsl_sync_task(name, dsl_destroy_head_check,
dsl_destroy_head_begin_sync, &ddha,
0, ZFS_SPACE_CHECK_NONE);
0, ZFS_SPACE_CHECK_DESTROY);
if (error != 0)
return (error);
@ -1047,7 +1047,7 @@ dsl_destroy_head(const char *name)
}
return (dsl_sync_task(name, dsl_destroy_head_check,
dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
}
/*

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 Martin Matuska. All rights reserved.
* Copyright (c) 2014 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@ -921,14 +921,14 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
if (pds) {
VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
name, sizeof (uint64_t), 1, &ddobj, tx));
} else {
/* it's the root dir */
VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
}
VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
ddphys = dbuf->db_data;
@ -966,6 +966,12 @@ dsl_dir_get_used(dsl_dir_t *dd)
return (dsl_dir_phys(dd)->dd_used_bytes);
}
uint64_t
dsl_dir_get_compressed(dsl_dir_t *dd)
{
return (dsl_dir_phys(dd)->dd_compressed_bytes);
}
uint64_t
dsl_dir_get_quota(dsl_dir_t *dd)
{
@ -1193,7 +1199,8 @@ dsl_dir_space_available(dsl_dir_t *dd,
used += dsl_dir_space_towrite(dd);
if (dd->dd_parent == NULL) {
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
ZFS_SPACE_CHECK_NORMAL);
quota = MIN(quota, poolsize);
}
@ -1298,11 +1305,12 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
*/
uint64_t deferred = 0;
if (dd->dd_parent == NULL) {
spa_t *spa = dd->dd_pool->dp_spa;
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
deferred = metaslab_class_get_deferred(spa_normal_class(spa));
if (poolsize - deferred < quota) {
quota = poolsize - deferred;
uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
(netfree) ?
ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
if (avail < quota) {
quota = avail;
retval = ENOSPC;
}
}
@ -1639,7 +1647,8 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
ddsqra.ddsqra_value = quota;
return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
dsl_dir_set_quota_sync, &ddsqra, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
int
@ -1682,7 +1691,8 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
avail = dsl_dir_space_available(dd->dd_parent,
NULL, 0, FALSE);
} else {
avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
avail = dsl_pool_adjustedsize(dd->dd_pool,
ZFS_SPACE_CHECK_NORMAL) - used;
}
if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
@ -1761,7 +1771,8 @@ dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
ddsqra.ddsqra_value = reservation;
return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
dsl_dir_set_reservation_sync, &ddsqra, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
static dsl_dir_t *

View File

@ -44,6 +44,8 @@
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
@ -197,6 +199,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
offsetof(dsl_dir_t, dd_dirty_link));
txg_list_create(&dp->dp_sync_tasks, spa,
offsetof(dsl_sync_task_t, dst_node));
txg_list_create(&dp->dp_early_sync_tasks, spa,
offsetof(dsl_sync_task_t, dst_node));
dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
@ -373,6 +377,7 @@ dsl_pool_close(dsl_pool_t *dp)
txg_list_destroy(&dp->dp_dirty_datasets);
txg_list_destroy(&dp->dp_dirty_zilogs);
txg_list_destroy(&dp->dp_sync_tasks);
txg_list_destroy(&dp->dp_early_sync_tasks);
txg_list_destroy(&dp->dp_dirty_dirs);
taskq_destroy(dp->dp_zil_clean_taskq);
@ -545,6 +550,27 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
cv_signal(&dp->dp_spaceavail_cv);
}
static boolean_t
dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
{
spa_t *spa = dp->dp_spa;
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
txg_list_t *tl = &vd->vdev_ms_list;
metaslab_t *ms;
for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
VERIFY(range_tree_is_empty(ms->ms_freeing));
VERIFY(range_tree_is_empty(ms->ms_checkpointing));
}
}
return (B_TRUE);
}
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
@ -560,6 +586,23 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
tx = dmu_tx_create_assigned(dp, txg);
/*
* Run all early sync tasks before writing out any dirty blocks.
* For more info on early sync tasks see block comment in
* dsl_early_sync_task().
*/
if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
dsl_sync_task_t *dst;
ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
while ((dst =
txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
ASSERT(dsl_early_sync_task_verify(dp, txg));
dsl_sync_task_sync(dst, tx);
}
ASSERT(dsl_early_sync_task_verify(dp, txg));
}
/*
* Write out all dirty blocks of dirty datasets.
*/
@ -714,22 +757,66 @@ dsl_pool_sync_context(dsl_pool_t *dp)
taskq_member(dp->dp_sync_taskq, curthread));
}
/*
* This function returns the amount of allocatable space in the pool
* minus whatever space is currently reserved by ZFS for specific
* purposes. Specifically:
*
* 1] Any reserved SLOP space
* 2] Any space used by the checkpoint
* 3] Any space used for deferred frees
*
* The latter 2 are especially important because they are needed to
* rectify the SPA's and DMU's different understanding of how much space
* is used. Now the DMU is aware of that extra space tracked by the SPA
* without having to maintain a separate special dir (e.g similar to
* $MOS, $FREEING, and $LEAKED).
*
* Note: By deferred frees here, we mean the frees that were deferred
* in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
* segments placed in ms_defer trees during metaslab_sync_done().
*/
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
uint64_t space, resv;
spa_t *spa = dp->dp_spa;
uint64_t space, resv, adjustedsize;
uint64_t spa_deferred_frees =
spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
/*
* If we're trying to assess whether it's OK to do a free,
* cut the reservation in half to allow forward progress
* (e.g. make it possible to rm(1) files from a full pool).
*/
space = spa_get_dspace(dp->dp_spa);
resv = spa_get_slop_space(dp->dp_spa);
if (netfree)
space = spa_get_dspace(spa)
- spa_get_checkpoint_space(spa) - spa_deferred_frees;
resv = spa_get_slop_space(spa);
switch (slop_policy) {
case ZFS_SPACE_CHECK_NORMAL:
break;
case ZFS_SPACE_CHECK_RESERVED:
resv >>= 1;
break;
case ZFS_SPACE_CHECK_EXTRA_RESERVED:
resv >>= 2;
break;
case ZFS_SPACE_CHECK_NONE:
resv = 0;
break;
default:
panic("invalid slop policy value: %d", slop_policy);
break;
}
adjustedsize = (space >= resv) ? (space - resv) : 0;
return (space - resv);
return (adjustedsize);
}
uint64_t
dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
uint64_t deferred =
metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
return (quota);
}
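
For illustration only (all numbers invented, not part of this diff): with
dspace = 100 GiB, 2 GiB consumed by the checkpoint, and 1 GiB of deferred
frees, the "space" term described above is 97 GiB. Assuming a full slop
reservation of 3.2 GiB, dsl_pool_adjustedsize() returns 93.8 GiB under
ZFS_SPACE_CHECK_NORMAL, 95.4 GiB under ZFS_SPACE_CHECK_RESERVED (half the
slop), 96.2 GiB under ZFS_SPACE_CHECK_EXTRA_RESERVED (a quarter of the
slop), and 97 GiB under ZFS_SPACE_CHECK_NONE. dsl_pool_unreserved_space()
then further subtracts the space deferred in the normal metaslab class from
whichever figure applies.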
boolean_t

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
@ -325,13 +326,23 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
* If the scrub/resilver completed, update all DTLs to
* reflect this. Whether it succeeded or not, vacate
* all temporary scrub DTLs.
*
* As the scrub does not currently support traversing
* data that have been freed but are part of a checkpoint,
* we don't mark the scrub as done in the DTLs as faults
* may still exist in those vdevs.
*/
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
if (complete) {
if (complete &&
!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
scn->scn_phys.scn_max_txg, B_TRUE);
spa_event_notify(spa, NULL, NULL,
scn->scn_phys.scn_min_txg ?
ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
} else {
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
0, B_TRUE);
}
spa_errlog_rotate(spa);
@ -583,7 +594,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
* (on-disk) even if it hasn't been claimed (even though for
* scrub there's nothing to do to it).
*/
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
return (0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@ -634,11 +645,13 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
zil_scan_arg_t zsa = { dp, zh };
zilog_t *zilog;
ASSERT(spa_writeable(dp->dp_spa));
/*
* We only want to visit blocks that have been claimed but not yet
* replayed (or, in read-only mode, blocks that *would* be claimed).
* We only want to visit blocks that have been claimed
* but not yet replayed.
*/
if (claim_txg == 0 && spa_writeable(dp->dp_spa))
if (claim_txg == 0)
return;
zilog = zil_alloc(dp->dp_meta_objset, zh);
@ -1562,61 +1575,16 @@ dsl_scan_active(dsl_scan_t *scn)
return (used != 0);
}
/* Called whenever a txg syncs. */
void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
static int
dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
int err = 0;
/*
* Check for scn_restart_txg before checking spa_load_state, so
* that we can restart an old-style scan while the pool is being
* imported (see dsl_scan_init).
*/
if (dsl_scan_restarting(scn, tx)) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_done(scn, B_FALSE, tx);
if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
func = POOL_SCAN_RESILVER;
zfs_dbgmsg("restarting scan func=%u txg=%llu",
func, tx->tx_txg);
dsl_scan_setup_sync(&func, tx);
}
if (spa_suspend_async_destroy(spa))
return (0);
/*
* Only process scans in sync pass 1.
*/
if (spa_sync_pass(dp->dp_spa) > 1)
return;
/*
* If the spa is shutting down, then stop scanning. This will
* ensure that the scan does not dirty any new data during the
* shutdown phase.
*/
if (spa_shutting_down(spa))
return;
/*
* If the scan is inactive due to a stalled async destroy, try again.
*/
if (!scn->scn_async_stalled && !dsl_scan_active(scn))
return;
scn->scn_visited_this_txg = 0;
scn->scn_suspending = B_FALSE;
scn->scn_sync_start_time = gethrtime();
spa->spa_scrub_active = B_TRUE;
/*
* First process the async destroys. If we suspend, don't do
* any scrubbing or resilvering. This ensures that there are no
* async destroys while we are scanning, so the scan code doesn't
* have to worry about traversing it. It is also faster to free the
* blocks than to scrub them.
*/
if (zfs_free_bpobj_enabled &&
spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
@ -1690,7 +1658,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
ddt_sync(spa, tx->tx_txg);
}
if (err != 0)
return;
return (err);
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
zfs_free_leak_on_eio &&
(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
@ -1744,6 +1712,67 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_pool_destroy_obsolete_bpobj(dp, tx);
}
return (0);
}
void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
int err = 0;
/*
* Check for scn_restart_txg before checking spa_load_state, so
* that we can restart an old-style scan while the pool is being
* imported (see dsl_scan_init).
*/
if (dsl_scan_restarting(scn, tx)) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_done(scn, B_FALSE, tx);
if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
func = POOL_SCAN_RESILVER;
zfs_dbgmsg("restarting scan func=%u txg=%llu",
func, tx->tx_txg);
dsl_scan_setup_sync(&func, tx);
}
/*
* Only process scans in sync pass 1.
*/
if (spa_sync_pass(dp->dp_spa) > 1)
return;
/*
* If the spa is shutting down, then stop scanning. This will
* ensure that the scan does not dirty any new data during the
* shutdown phase.
*/
if (spa_shutting_down(spa))
return;
/*
* If the scan is inactive due to a stalled async destroy, try again.
*/
if (!scn->scn_async_stalled && !dsl_scan_active(scn))
return;
scn->scn_visited_this_txg = 0;
scn->scn_suspending = B_FALSE;
scn->scn_sync_start_time = gethrtime();
spa->spa_scrub_active = B_TRUE;
/*
* First process the async destroys. If we pause, don't do
* any scrubbing or resilvering. This ensures that there are no
* async destroys while we are scanning, so the scan code doesn't
* have to worry about traversing it. It is also faster to free the
* blocks than to scrub them.
*/
err = dsl_process_async_destroys(dp, tx);
if (err != 0)
return;
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
@ -2038,7 +2067,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
}
return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
static boolean_t

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#include <sys/dmu.h>
@ -39,33 +39,10 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
return (0);
}
/*
* Called from open context to perform a callback in syncing context. Waits
* for the operation to complete.
*
* The checkfunc will be called from open context as a preliminary check
* which can quickly fail. If it succeeds, it will be called again from
* syncing context. The checkfunc should generally be designed to work
* properly in either context, but if necessary it can check
* dmu_tx_is_syncing(tx).
*
* The synctask infrastructure enforces proper locking strategy with respect
* to the dp_config_rwlock -- the lock will always be held when the callbacks
* are called. It will be held for read during the open-context (preliminary)
* call to the checkfunc, and then held for write from syncing context during
* the calls to the check and sync funcs.
*
* A dataset or pool name can be passed as the first argument. Typically,
* the check func will hold, check the return value of the hold, and then
* release the dataset. The sync func will VERIFY0(hold()) the dataset.
* This is safe because no changes can be made between the check and sync funcs,
* and the sync func will only be called if the check func successfully opened
* the dataset.
*/
int
dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
static int
dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check)
int blocks_modified, zfs_space_check_t space_check, boolean_t early)
{
spa_t *spa;
dmu_tx_t *tx;
@ -102,7 +79,9 @@ dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
return (err);
}
VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg));
txg_list_t *task_list = (early) ?
&dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg));
dmu_tx_commit(tx);
@ -117,9 +96,64 @@ dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
return (dst.dst_error);
}
void
dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
/*
* Called from open context to perform a callback in syncing context. Waits
* for the operation to complete.
*
* The checkfunc will be called from open context as a preliminary check
* which can quickly fail. If it succeeds, it will be called again from
* syncing context. The checkfunc should generally be designed to work
* properly in either context, but if necessary it can check
* dmu_tx_is_syncing(tx).
*
* The synctask infrastructure enforces proper locking strategy with respect
* to the dp_config_rwlock -- the lock will always be held when the callbacks
* are called. It will be held for read during the open-context (preliminary)
* call to the checkfunc, and then held for write from syncing context during
* the calls to the check and sync funcs.
*
* A dataset or pool name can be passed as the first argument. Typically,
* the check func will hold, check the return value of the hold, and then
* release the dataset. The sync func will VERIFY0(hold()) the dataset.
* This is safe because no changes can be made between the check and sync funcs,
* and the sync func will only be called if the check func successfully opened
* the dataset.
*/
int
dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check)
{
return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
blocks_modified, space_check, B_FALSE));
}
/*
* An early synctask works exactly as a standard synctask with one important
* difference on the way it is handled during syncing context. Standard
* synctasks run after we've written out all the dirty blocks of dirty
* datasets. Early synctasks are executed before writing out any dirty data,
* and thus before standard synctasks.
*
* For that reason, early synctasks can affect the process of writing dirty
* changes to disk for the txg that they run and should be used with caution.
* In addition, early synctasks should not dirty any metaslabs as this would
* invalidate the precondition/invariant for subsequent early synctasks.
* [see dsl_pool_sync() and dsl_early_sync_task_verify()]
*/
int
dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check)
{
return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
blocks_modified, space_check, B_TRUE));
}
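
For illustration only, not part of this diff: a minimal kernel-context sketch
of how a hypothetical consumer might dispatch work through the new
early-synctask path described in the comment above. my_early_check(),
my_early_sync(), and the choice of ZFS_SPACE_CHECK_EXTRA_RESERVED are
placeholders, not anything this change actually adds.

#include <sys/dmu.h>
#include <sys/dsl_synctask.h>

/* ARGSUSED */
static int
my_early_check(void *arg, dmu_tx_t *tx)
{
	/* Open-context precheck; called again from syncing context. */
	return (0);
}

/* ARGSUSED */
static void
my_early_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * Runs in syncing context before any dirty data is written for
	 * the txg; must not dirty any metaslabs (see the comment above).
	 */
}

static int
my_early_task_dispatch(const char *pool)
{
	return (dsl_early_sync_task(pool, my_early_check, my_early_sync,
	    NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}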
static void
dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
boolean_t early)
{
dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
@ -133,7 +167,25 @@ dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
dst->dst_error = 0;
dst->dst_nowaiter = B_TRUE;
VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg));
txg_list_t *task_list = (early) ?
&dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg));
}
void
dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
{
dsl_sync_task_nowait_common(dp, syncfunc, arg,
blocks_modified, space_check, tx, B_FALSE);
}
void
dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
{
dsl_sync_task_nowait_common(dp, syncfunc, arg,
blocks_modified, space_check, tx, B_TRUE);
}
/*
@ -160,12 +212,12 @@ dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
* (arc_tempreserve, dsl_pool_tempreserve).
*/
if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
uint64_t quota = dsl_pool_adjustedsize(dp,
dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) -
metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
uint64_t quota = dsl_pool_unreserved_space(dp,
dst->dst_space_check);
uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
/* MOS space is triple-dittoed, so we multiply by 3. */
if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
if (used + dst->dst_space * 3 > quota) {
dst->dst_error = SET_ERROR(ENOSPC);
if (dst->dst_nowaiter)
kmem_free(dst, sizeof (*dst));

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/
@ -602,7 +602,8 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
ddura.ddura_chkholds = fnvlist_alloc();
error = dsl_sync_task(pool, dsl_dataset_user_release_check,
dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE);
dsl_dataset_user_release_sync, &ddura, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED);
fnvlist_free(ddura.ddura_todelete);
fnvlist_free(ddura.ddura_chkholds);

View File

@ -35,6 +35,7 @@
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>
#define GANG_ALLOCATION(flags) \
((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
@ -42,6 +43,14 @@
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* Since we can touch multiple metaslabs (and their respective space maps)
* with each transaction group, we benefit from having a smaller space map
* block size since it allows us to issue more I/O operations scattered
* around the disk.
*/
int zfs_metaslab_sm_blksz = (1 << 12);
/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
@ -201,7 +210,7 @@ uint64_t metaslab_trace_max_entries = 5000;
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
kmem_cache_t *metaslab_alloc_trace_cache;
@ -486,11 +495,11 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
allocated +=
range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
msp_free_space = range_tree_space(msp->ms_tree) + allocated +
msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
}
@ -1028,9 +1037,9 @@ metaslab_rt_create(range_tree_t *rt, void *arg)
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT(msp->ms_tree == NULL);
ASSERT(msp->ms_allocatable == NULL);
avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
@ -1043,10 +1052,10 @@ metaslab_rt_destroy(range_tree_t *rt, void *arg)
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_tree, ==, rt);
ASSERT0(avl_numnodes(&msp->ms_size_tree));
ASSERT3P(msp->ms_allocatable, ==, rt);
ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));
avl_destroy(&msp->ms_size_tree);
avl_destroy(&msp->ms_allocatable_by_size);
}
static void
@ -1055,9 +1064,9 @@ metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_tree, ==, rt);
ASSERT3P(msp->ms_allocatable, ==, rt);
VERIFY(!msp->ms_condensing);
avl_add(&msp->ms_size_tree, rs);
avl_add(&msp->ms_allocatable_by_size, rs);
}
static void
@ -1066,9 +1075,9 @@ metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_tree, ==, rt);
ASSERT3P(msp->ms_allocatable, ==, rt);
VERIFY(!msp->ms_condensing);
avl_remove(&msp->ms_size_tree, rs);
avl_remove(&msp->ms_allocatable_by_size, rs);
}
static void
@ -1077,7 +1086,7 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg)
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_tree, ==, rt);
ASSERT3P(msp->ms_allocatable, ==, rt);
/*
* Normally one would walk the tree freeing nodes along the way.
@ -1085,7 +1094,7 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg)
* walking all nodes and just reinitialize the avl tree. The nodes
* will be freed by the range tree, so we don't want to free them here.
*/
avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
@ -1109,7 +1118,7 @@ static range_tree_ops_t metaslab_rt_ops = {
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
avl_tree_t *t = &msp->ms_size_tree;
avl_tree_t *t = &msp->ms_allocatable_by_size;
range_seg_t *rs;
if (t == NULL || (rs = avl_last(t)) == NULL)
@ -1184,7 +1193,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
*/
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
avl_tree_t *t = &msp->ms_tree->rt_root;
avl_tree_t *t = &msp->ms_allocatable->rt_root;
return (metaslab_block_picker(t, cursor, size, align));
}
@ -1213,13 +1222,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
*/
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
range_tree_t *rt = msp->ms_tree;
range_tree_t *rt = msp->ms_allocatable;
avl_tree_t *t = &rt->rt_root;
uint64_t max_size = metaslab_block_maxsize(msp);
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
ASSERT3U(avl_numnodes(t), ==,
avl_numnodes(&msp->ms_allocatable_by_size));
if (max_size < size)
return (-1ULL);
@ -1230,7 +1240,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
*/
if (max_size < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
t = &msp->ms_size_tree;
t = &msp->ms_allocatable_by_size;
*cursor = 0;
}
@ -1253,8 +1263,8 @@ static metaslab_ops_t metaslab_df_ops = {
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
range_tree_t *rt = msp->ms_tree;
avl_tree_t *t = &msp->ms_size_tree;
range_tree_t *rt = msp->ms_allocatable;
avl_tree_t *t = &msp->ms_allocatable_by_size;
uint64_t *cursor = &msp->ms_lbas[0];
uint64_t *cursor_end = &msp->ms_lbas[1];
uint64_t offset = 0;
@ -1267,7 +1277,7 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
if ((*cursor + size) > *cursor_end) {
range_seg_t *rs;
rs = avl_last(&msp->ms_size_tree);
rs = avl_last(&msp->ms_allocatable_by_size);
if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
return (-1ULL);
@ -1303,7 +1313,7 @@ uint64_t metaslab_ndf_clump_shift = 4;
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
avl_tree_t *t = &msp->ms_tree->rt_root;
avl_tree_t *t = &msp->ms_allocatable->rt_root;
avl_index_t where;
range_seg_t *rs, rsearch;
uint64_t hbit = highbit64(size);
@ -1311,7 +1321,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
uint64_t max_size = metaslab_block_maxsize(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
ASSERT3U(avl_numnodes(t), ==,
avl_numnodes(&msp->ms_allocatable_by_size));
if (max_size < size)
return (-1ULL);
@ -1321,7 +1332,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
rs = avl_find(t, &rsearch, &where);
if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
t = &msp->ms_size_tree;
t = &msp->ms_allocatable_by_size;
rsearch.rs_start = 0;
rsearch.rs_end = MIN(max_size,
@ -1385,13 +1396,15 @@ metaslab_load(metaslab_t *msp)
/*
* If the space map has not been allocated yet, then treat
* all the space in the metaslab as free and add it to the
* ms_tree.
* all the space in the metaslab as free and add it to ms_allocatable.
*/
if (msp->ms_sm != NULL)
error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
else
range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
if (msp->ms_sm != NULL) {
error = space_map_load(msp->ms_sm, msp->ms_allocatable,
SM_FREE);
} else {
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
}
success = (error == 0);
@ -1402,9 +1415,16 @@ metaslab_load(metaslab_t *msp)
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defertree[t],
range_tree_remove, msp->ms_tree);
/*
* If the metaslab already has a spacemap, then we need to
* remove all segments from the defer tree; otherwise, the
* metaslab is completely empty and we can skip this.
*/
if (msp->ms_sm != NULL) {
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
range_tree_remove, msp->ms_allocatable);
}
}
msp->ms_max_size = metaslab_block_maxsize(msp);
}
@ -1416,7 +1436,7 @@ void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
range_tree_vacate(msp->ms_tree, NULL, NULL);
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
@ -1462,7 +1482,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms);
ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms);
@ -1514,20 +1534,21 @@ metaslab_fini(metaslab_t *msp)
space_map_close(msp->ms_sm);
metaslab_unload(msp);
range_tree_destroy(msp->ms_tree);
range_tree_destroy(msp->ms_freeingtree);
range_tree_destroy(msp->ms_freedtree);
range_tree_destroy(msp->ms_allocatable);
range_tree_destroy(msp->ms_freeing);
range_tree_destroy(msp->ms_freed);
for (int t = 0; t < TXG_SIZE; t++) {
range_tree_destroy(msp->ms_alloctree[t]);
range_tree_destroy(msp->ms_allocating[t]);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_destroy(msp->ms_defertree[t]);
range_tree_destroy(msp->ms_defer[t]);
}
ASSERT0(msp->ms_deferspace);
range_tree_destroy(msp->ms_checkpointing);
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@ -1747,7 +1768,7 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
segments <<= 1;
segments += msp->ms_tree->rt_histogram[i];
segments += msp->ms_allocatable->rt_histogram[i];
/*
* The range tree provides more precision than the space map
@ -1963,7 +1984,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
* or we would be leaving space on the table.
*/
ASSERT(size >= SPA_MINBLOCKSIZE ||
range_tree_space(msp->ms_tree) == 0);
range_tree_is_empty(msp->ms_allocatable));
ASSERT0(weight & METASLAB_ACTIVE_MASK);
msp->ms_activation_weight = 0;
@ -2094,18 +2115,37 @@ metaslab_should_condense(metaslab_t *msp)
range_seg_t *rs;
uint64_t size, entries, segsz, object_size, optimal_size, record_size;
dmu_object_info_t doi;
uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
vdev_t *vd = msp->ms_group->mg_vd;
uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
/*
* Use the ms_size_tree range tree, which is ordered by size, to
* obtain the largest segment in the free tree. We always condense
* metaslabs that are empty and metaslabs for which a condense
* request has been made.
* Allocations and frees in early passes are generally more space
* efficient (in terms of blocks described in space map entries)
* than the ones in later passes (e.g. we don't compress after
* sync pass 5) and condensing a metaslab multiple times in a txg
* could degrade performance.
*
* Thus we prefer condensing each metaslab at most once every txg at
* the earliest sync pass possible. If a metaslab is eligible for
* condensing again after being considered for condensing within the
* same txg, it will hopefully be dirty in the next txg where it will
* be condensed at an earlier pass.
*/
rs = avl_last(&msp->ms_size_tree);
if (msp->ms_condense_checked_txg == current_txg)
return (B_FALSE);
msp->ms_condense_checked_txg = current_txg;
/*
* Use the ms_allocatable_by_size range tree, which is ordered by
* size, to obtain the largest segment in the free tree. We always
* condense metaslabs that are empty and metaslabs for which a
* condense request has been made.
*/
rs = avl_last(&msp->ms_allocatable_by_size);
if (rs == NULL || msp->ms_condense_wanted)
return (B_TRUE);
@ -2119,7 +2159,8 @@ metaslab_should_condense(metaslab_t *msp)
entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t);
optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
optimal_size =
sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
object_size = space_map_length(msp->ms_sm);
dmu_object_info_from_db(sm->sm_dbuf, &doi);
@ -2138,20 +2179,18 @@ metaslab_should_condense(metaslab_t *msp)
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
range_tree_t *condense_tree;
space_map_t *sm = msp->ms_sm;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(spa_sync_pass(spa), ==, 1);
ASSERT(msp->ms_loaded);
spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
msp->ms_group->mg_vd->vdev_spa->spa_name,
space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
space_map_length(msp->ms_sm),
avl_numnodes(&msp->ms_allocatable->rt_root),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
@ -2166,20 +2205,16 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
condense_tree = range_tree_create(NULL, NULL);
range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
/*
* Remove what's been freed in this txg from the condense_tree.
* Since we're in sync_pass 1, we know that all the frees from
* this txg are in the freeingtree.
*/
range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defertree[t],
range_tree_walk(msp->ms_defer[t],
range_tree_remove, condense_tree);
}
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
range_tree_remove, condense_tree);
}
@ -2189,13 +2224,13 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
* metaslab's ms_condensing flag to ensure that
* allocations on this metaslab do not occur while we're
* in the middle of committing it to disk. This is only critical
* for the ms_tree as all other range trees use per txg
* for ms_allocatable as all other range trees use per txg
* views of their content.
*/
msp->ms_condensing = B_TRUE;
mutex_exit(&msp->ms_lock);
space_map_truncate(sm, tx);
space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
/*
* While we would ideally like to create a space map representation
@ -2211,7 +2246,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
range_tree_vacate(condense_tree, NULL, NULL);
range_tree_destroy(condense_tree);
space_map_write(sm, msp->ms_tree, SM_FREE, tx);
space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
mutex_enter(&msp->ms_lock);
msp->ms_condensing = B_FALSE;
}
@ -2226,7 +2261,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa_meta_objset(spa);
range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
dmu_tx_t *tx;
uint64_t object = space_map_object(msp->ms_sm);
@ -2235,23 +2270,24 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* This metaslab has just been added so there's no work to do now.
*/
if (msp->ms_freeingtree == NULL) {
if (msp->ms_freeing == NULL) {
ASSERT3P(alloctree, ==, NULL);
return;
}
ASSERT3P(alloctree, !=, NULL);
ASSERT3P(msp->ms_freeingtree, !=, NULL);
ASSERT3P(msp->ms_freedtree, !=, NULL);
ASSERT3P(msp->ms_freeing, !=, NULL);
ASSERT3P(msp->ms_freed, !=, NULL);
ASSERT3P(msp->ms_checkpointing, !=, NULL);
/*
* Normally, we don't want to process a metaslab if there
* are no allocations or frees to perform. However, if the metaslab
* is being forced to condense and it's loaded, we need to let it
* through.
* Normally, we don't want to process a metaslab if there are no
* allocations or frees to perform. However, if the metaslab is being
* forced to condense and it's loaded, we need to let it through.
*/
if (range_tree_space(alloctree) == 0 &&
range_tree_space(msp->ms_freeingtree) == 0 &&
if (range_tree_is_empty(alloctree) &&
range_tree_is_empty(msp->ms_freeing) &&
range_tree_is_empty(msp->ms_checkpointing) &&
!(msp->ms_loaded && msp->ms_condense_wanted))
return;
@ -2260,10 +2296,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_tree. No other thread can
* be modifying this txg's alloctree, freeingtree, freedtree, or
* space_map_phys_t. We drop ms_lock whenever we could call
* into the DMU, because the DMU can call down to us
* metaslab_sync() is the metaslab's ms_allocatable. No other
* thread can be modifying this txg's alloc, freeing,
* freed, or space_map_phys_t. We drop ms_lock whenever we
* could call into the DMU, because the DMU can call down to us
* (e.g. via zio_free()) at any time.
*
* The spa_vdev_remove_thread() can be reading metaslab state
@ -2271,13 +2307,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* that the ms_lock is insufficient for this, because it is dropped
* by space_map_write().
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
if (msp->ms_sm == NULL) {
uint64_t new_object;
new_object = space_map_alloc(mos, tx);
new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
@ -2285,6 +2320,28 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
ASSERT(msp->ms_sm != NULL);
}
if (!range_tree_is_empty(msp->ms_checkpointing) &&
vd->vdev_checkpoint_sm == NULL) {
ASSERT(spa_has_checkpoint(spa));
uint64_t new_object = space_map_alloc(mos,
vdev_standard_sm_blksz, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
/*
* We save the space map object as an entry in vdev_top_zap
* so it can be retrieved when the pool is reopened after an
* export or through zdb.
*/
VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
sizeof (new_object), 1, &new_object, tx));
}
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
@ -2297,16 +2354,40 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
metaslab_class_histogram_verify(mg->mg_class);
metaslab_group_histogram_remove(mg, msp);
if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
metaslab_should_condense(msp)) {
if (msp->ms_loaded && metaslab_should_condense(msp)) {
metaslab_condense(msp, txg, tx);
} else {
mutex_exit(&msp->ms_lock);
space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
mutex_enter(&msp->ms_lock);
}
if (!range_tree_is_empty(msp->ms_checkpointing)) {
ASSERT(spa_has_checkpoint(spa));
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
/*
* Since we are doing writes to disk and the ms_checkpointing
* tree won't be changing during that time, we drop the
* ms_lock while writing to the checkpoint space map.
*/
mutex_exit(&msp->ms_lock);
space_map_write(vd->vdev_checkpoint_sm,
msp->ms_checkpointing, SM_FREE, tx);
mutex_enter(&msp->ms_lock);
space_map_update(vd->vdev_checkpoint_sm);
spa->spa_checkpoint_info.sci_dspace +=
range_tree_space(msp->ms_checkpointing);
vd->vdev_stat.vs_checkpoint_space +=
range_tree_space(msp->ms_checkpointing);
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
-vd->vdev_checkpoint_sm->sm_alloc);
range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
}
if (msp->ms_loaded) {
/*
* When the space map is loaded, we have an accurate
@ -2315,7 +2396,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* it first before updating it.
*/
space_map_histogram_clear(msp->ms_sm);
space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
/*
* Since we've cleared the histogram we need to add back
@ -2324,7 +2405,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* to accurately reflect all free space even if some space
* is not yet available for allocation (i.e. deferred).
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
/*
* Add back any deferred free space that has not been
@ -2335,7 +2416,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
space_map_histogram_add(msp->ms_sm,
msp->ms_defertree[t], tx);
msp->ms_defer[t], tx);
}
}
@ -2346,7 +2427,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* then we will lose some accuracy but will correct it the next
* time we load the space map.
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
@ -2354,21 +2435,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* For sync pass 1, we avoid traversing this txg's free range tree
* and instead will just swap the pointers for freeingtree and
* freedtree. We can safely do this since the freed_tree is
* and instead will just swap the pointers for freeing and
* freed. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
} else {
range_tree_vacate(msp->ms_freeingtree,
range_tree_add, msp->ms_freedtree);
range_tree_vacate(msp->ms_freeing,
range_tree_add, msp->ms_freed);
}
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freeingtree));
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
& TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_checkpointing));
mutex_exit(&msp->ms_lock);
@ -2403,29 +2486,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* If this metaslab is just becoming available, initialize its
* range trees and add its capacity to the vdev.
*/
if (msp->ms_freedtree == NULL) {
if (msp->ms_freed == NULL) {
for (int t = 0; t < TXG_SIZE; t++) {
ASSERT(msp->ms_alloctree[t] == NULL);
ASSERT(msp->ms_allocating[t] == NULL);
msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
msp->ms_allocating[t] = range_tree_create(NULL, NULL);
}
ASSERT3P(msp->ms_freeingtree, ==, NULL);
msp->ms_freeingtree = range_tree_create(NULL, NULL);
ASSERT3P(msp->ms_freeing, ==, NULL);
msp->ms_freeing = range_tree_create(NULL, NULL);
ASSERT3P(msp->ms_freedtree, ==, NULL);
msp->ms_freedtree = range_tree_create(NULL, NULL);
ASSERT3P(msp->ms_freed, ==, NULL);
msp->ms_freed = range_tree_create(NULL, NULL);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
ASSERT(msp->ms_defertree[t] == NULL);
ASSERT(msp->ms_defer[t] == NULL);
msp->ms_defertree[t] = range_tree_create(NULL, NULL);
msp->ms_defer[t] = range_tree_create(NULL, NULL);
}
ASSERT3P(msp->ms_checkpointing, ==, NULL);
msp->ms_checkpointing = range_tree_create(NULL, NULL);
vdev_space_update(vd, 0, 0, msp->ms_size);
}
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_checkpointing));
defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
metaslab_class_get_alloc(spa_normal_class(spa));
@ -2436,7 +2524,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
defer_delta = 0;
alloc_delta = space_map_alloc_delta(msp->ms_sm);
if (defer_allowed) {
defer_delta = range_tree_space(msp->ms_freedtree) -
defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
} else {
defer_delta -= range_tree_space(*defer_tree);
@ -2452,19 +2540,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
/*
* Move the frees from the defer_tree back to the free
* range tree (if it's loaded). Swap the freed_tree and the
* defer_tree -- this is safe to do because we've just emptied out
* the defer_tree.
* range tree (if it's loaded). Swap the freed_tree and
* the defer_tree -- this is safe to do because we've
* just emptied out the defer_tree.
*/
range_tree_vacate(*defer_tree,
msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
if (defer_allowed) {
range_tree_swap(&msp->ms_freedtree, defer_tree);
range_tree_swap(&msp->ms_freed, defer_tree);
} else {
range_tree_vacate(msp->ms_freedtree,
msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
range_tree_vacate(msp->ms_freed,
msp->ms_loaded ? range_tree_add : NULL,
msp->ms_allocatable);
}
space_map_update(msp->ms_sm);
msp->ms_deferspace += defer_delta;
@ -2492,16 +2580,17 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
msp->ms_selected_txg + metaslab_unload_delay < txg) {
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
VERIFY0(range_tree_space(
msp->ms_alloctree[(txg + t) & TXG_MASK]));
msp->ms_allocating[(txg + t) & TXG_MASK]));
}
if (!metaslab_debug_unload)
metaslab_unload(msp);
}
ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freeingtree));
ASSERT0(range_tree_space(msp->ms_freedtree));
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_freed));
ASSERT0(range_tree_space(msp->ms_checkpointing));
mutex_exit(&msp->ms_lock);
}
@ -2705,7 +2794,7 @@ static uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
{
uint64_t start;
range_tree_t *rt = msp->ms_tree;
range_tree_t *rt = msp->ms_allocatable;
metaslab_class_t *mc = msp->ms_group->mg_class;
VERIFY(!msp->ms_condensing);
@ -2720,10 +2809,10 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
range_tree_remove(rt, start, size);
if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
/* Track the last successful allocation */
msp->ms_alloc_txg = txg;
@ -3191,12 +3280,11 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
uint64_t txg)
boolean_t checkpoint)
{
metaslab_t *msp;
spa_t *spa = vd->vdev_spa;
ASSERT3U(txg, ==, spa->spa_syncing_txg);
ASSERT(vdev_is_concrete(vd));
ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
@ -3210,11 +3298,19 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
metaslab_check_free_impl(vd, offset, asize);
mutex_enter(&msp->ms_lock);
if (range_tree_space(msp->ms_freeingtree) == 0) {
vdev_dirty(vd, VDD_METASLAB, msp, txg);
if (range_tree_is_empty(msp->ms_freeing) &&
range_tree_is_empty(msp->ms_checkpointing)) {
vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
}
if (checkpoint) {
ASSERT(spa_has_checkpoint(spa));
range_tree_add(msp->ms_checkpointing, offset, asize);
} else {
range_tree_add(msp->ms_freeing, offset, asize);
}
range_tree_add(msp->ms_freeingtree, offset, asize);
mutex_exit(&msp->ms_lock);
}
@ -3223,23 +3319,25 @@ void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
uint64_t *txgp = arg;
boolean_t *checkpoint = arg;
ASSERT3P(checkpoint, !=, NULL);
if (vd->vdev_ops->vdev_op_remap != NULL)
vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
vdev_indirect_mark_obsolete(vd, offset, size);
else
metaslab_free_impl(vd, offset, size, *txgp);
metaslab_free_impl(vd, offset, size, *checkpoint);
}
static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
uint64_t txg)
boolean_t checkpoint)
{
spa_t *spa = vd->vdev_spa;
ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
if (txg > spa_freeze_txg(spa))
if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
return;
if (spa->spa_vdev_removal != NULL &&
@ -3251,13 +3349,13 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
* an indirect vdev (in open context), and then (in syncing
* context) clear spa_vdev_removal.
*/
free_from_removing_vdev(vd, offset, size, txg);
free_from_removing_vdev(vd, offset, size);
} else if (vd->vdev_ops->vdev_op_remap != NULL) {
vdev_indirect_mark_obsolete(vd, offset, size, txg);
vdev_indirect_mark_obsolete(vd, offset, size);
vd->vdev_ops->vdev_op_remap(vd, offset, size,
metaslab_free_impl_cb, &txg);
metaslab_free_impl_cb, &checkpoint);
} else {
metaslab_free_concrete(vd, offset, size, txg);
metaslab_free_concrete(vd, offset, size, checkpoint);
}
}
@ -3434,26 +3532,25 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
mutex_enter(&msp->ms_lock);
range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
offset, size);
VERIFY(!msp->ms_condensing);
VERIFY3U(offset, >=, msp->ms_start);
VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
msp->ms_size);
VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
range_tree_add(msp->ms_tree, offset, size);
range_tree_add(msp->ms_allocatable, offset, size);
mutex_exit(&msp->ms_lock);
}
/*
* Free the block represented by DVA in the context of the specified
* transaction group.
* Free the block represented by the given DVA.
*/
void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
{
uint64_t vdev = DVA_GET_VDEV(dva);
uint64_t offset = DVA_GET_OFFSET(dva);
@ -3467,7 +3564,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
}
metaslab_free_impl(vd, offset, size, txg);
metaslab_free_impl(vd, offset, size, checkpoint);
}
/*
@ -3537,7 +3634,8 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
if (error == 0 &&
!range_tree_contains(msp->ms_allocatable, offset, size))
error = SET_ERROR(ENOENT);
if (error || txg == 0) { /* txg == 0 indicates dry run */
@ -3548,13 +3646,15 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
VERIFY(!msp->ms_condensing);
VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
range_tree_remove(msp->ms_tree, offset, size);
VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
msp->ms_size);
range_tree_remove(msp->ms_allocatable, offset, size);
if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(vd, VDD_METASLAB, msp, txg);
range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
range_tree_add(msp->ms_allocating[txg & TXG_MASK],
offset, size);
}
mutex_exit(&msp->ms_lock);
@ -3699,13 +3799,41 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
/*
	 * If we have a checkpoint for the pool, we need to make sure that
	 * the blocks we free that are part of the checkpoint won't be
	 * reused until the checkpoint is discarded or we revert to it.
*
* The checkpoint flag is passed down the metaslab_free code path
* and is set whenever we want to add a block to the checkpoint's
* accounting. That is, we "checkpoint" blocks that existed at the
* time the checkpoint was created and are therefore referenced by
* the checkpointed uberblock.
*
	 * Note that we don't checkpoint any blocks if the current
* syncing txg <= spa_checkpoint_txg. We want these frees to sync
* normally as they will be referenced by the checkpointed uberblock.
*/
boolean_t checkpoint = B_FALSE;
if (bp->blk_birth <= spa->spa_checkpoint_txg &&
spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
/*
* At this point, if the block is part of the checkpoint
* there is no way it was created in the current txg.
*/
ASSERT(!now);
ASSERT3U(spa_syncing_txg(spa), ==, txg);
checkpoint = B_TRUE;
}
spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
for (int d = 0; d < ndvas; d++) {
if (now) {
metaslab_unalloc_dva(spa, &dva[d], txg);
} else {
metaslab_free_dva(spa, &dva[d], txg);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
metaslab_free_dva(spa, &dva[d], checkpoint);
}
}
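
As an aside, the accounting rule above can be summarized by a small hypothetical predicate (illustrative only, not part of this change): a free is charged to the checkpoint only when the block predates the checkpoint and the free itself happens in a txg later than the checkpointed one.

/* Hypothetical helper mirroring the condition used above. */
static boolean_t
free_goes_to_checkpoint(spa_t *spa, const blkptr_t *bp)
{
	/* With no checkpoint, spa_checkpoint_txg is 0 and this is false. */
	return (bp->blk_birth <= spa->spa_checkpoint_txg &&
	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg);
}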
@ -3777,12 +3905,13 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
mutex_enter(&msp->ms_lock);
if (msp->ms_loaded)
range_tree_verify(msp->ms_tree, offset, size);
range_tree_verify(msp->ms_allocatable, offset, size);
range_tree_verify(msp->ms_freeingtree, offset, size);
range_tree_verify(msp->ms_freedtree, offset, size);
range_tree_verify(msp->ms_freeing, offset, size);
range_tree_verify(msp->ms_checkpointing, offset, size);
range_tree_verify(msp->ms_freed, offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
range_tree_verify(msp->ms_defertree[j], offset, size);
range_tree_verify(msp->ms_defer[j], offset, size);
mutex_exit(&msp->ms_lock);
}

View File

@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -391,7 +391,6 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
range_seg_t *rs;
for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}
@ -401,3 +400,10 @@ range_tree_space(range_tree_t *rt)
{
return (rt->rt_space);
}
boolean_t
range_tree_is_empty(range_tree_t *rt)
{
ASSERT(rt != NULL);
return (range_tree_space(rt) == 0);
}

View File

@ -151,8 +151,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
boolean_t reloading);
static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
@ -216,6 +215,7 @@ uint64_t zfs_max_missing_tvds = 0;
* and we get a chance to retrieve the trusted config.
*/
uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
/*
* In the case where config was assembled by scanning device paths (/dev/dsks
* by default) we are less tolerant since all the existing devices should have
@ -223,6 +223,11 @@ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
*/
uint64_t zfs_max_missing_tvds_scan = 0;
/*
* Debugging aid that pauses spa_sync() towards the end.
*/
boolean_t zfs_pause_spa_sync = B_FALSE;
/*
* ==========================================================================
* SPA properties routines
@ -274,6 +279,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
size - alloc, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
spa->spa_checkpoint_info.sci_dspace, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
metaslab_class_fragmentation(mc), src);
@ -785,6 +792,12 @@ spa_change_guid_check(void *arg, dmu_tx_t *tx)
vdev_t *rvd = spa->spa_root_vdev;
uint64_t vdev_state;
if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
int error = (spa_has_checkpoint(spa)) ?
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
return (SET_ERROR(error));
}
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vdev_state = rvd->vdev_state;
spa_config_exit(spa, SCL_STATE, FTAG);
@ -1344,6 +1357,12 @@ spa_unload(spa_t *spa)
spa->spa_condense_zthr = NULL;
}
if (spa->spa_checkpoint_discard_zthr != NULL) {
ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr));
zthr_destroy(spa->spa_checkpoint_discard_zthr);
spa->spa_checkpoint_discard_zthr = NULL;
}
spa_condense_fini(spa);
bpobj_close(&spa->spa_deferred_bpobj);
@ -1427,6 +1446,18 @@ spa_load_spares(spa_t *spa)
int i;
vdev_t *vd, *tvd;
#ifndef _KERNEL
/*
* zdb opens both the current state of the pool and the
* checkpointed state (if present), with a different spa_t.
*
* As spare vdevs are shared among open pools, we skip loading
* them when we load the checkpointed state of the pool.
*/
if (!spa_writeable(spa))
return;
#endif
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
/*
@ -1546,6 +1577,19 @@ spa_load_l2cache(spa_t *spa)
vdev_t *vd, **oldvdevs, **newvdevs;
spa_aux_vdev_t *sav = &spa->spa_l2cache;
#ifndef _KERNEL
/*
* zdb opens both the current state of the pool and the
* checkpointed state (if present), with a different spa_t.
*
* As L2 caches are part of the ARC which is shared among open
* pools, we skip loading them when we load the checkpointed
* state of the pool.
*/
if (!spa_writeable(spa))
return;
#endif
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
if (sav->sav_config != NULL) {
@ -2093,6 +2137,11 @@ spa_spawn_aux_threads(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_start_indirect_condensing_thread(spa);
ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
spa->spa_checkpoint_discard_zthr =
zthr_create(spa_checkpoint_discard_thread_check,
spa_checkpoint_discard_thread, spa);
}
/*
@ -2186,7 +2235,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
spa->spa_load_state = state;
gethrestime(&spa->spa_loaded_ts);
error = spa_load_impl(spa, type, &ereport, B_FALSE);
error = spa_load_impl(spa, type, &ereport);
/*
* Don't count references from objsets that are already closed
@ -2291,8 +2340,25 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
return (SET_ERROR(EINVAL));
}
if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state ==
SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) {
/*
* If we are doing an import, ensure that the pool is not already
* imported by checking if its pool guid already exists in the
* spa namespace.
*
	 * The only case in which we allow an already imported pool to be
	 * imported again is when the pool is checkpointed and we want to
	 * look at its checkpointed state with userland tools like zdb.
*/
#ifdef _KERNEL
if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
spa_guid_exists(pool_guid, 0)) {
#else
if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
spa_guid_exists(pool_guid, 0) &&
!spa_importing_readonly_checkpoint(spa)) {
#endif
spa_load_failed(spa, "a pool with guid %llu is already open",
(u_longlong_t)pool_guid);
return (SET_ERROR(EEXIST));
@ -2451,6 +2517,19 @@ spa_ld_validate_vdevs(spa_t *spa)
return (0);
}
static void
spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
{
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
spa->spa_first_txg = spa->spa_last_ubsync_txg ?
spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
spa->spa_claim_max_txg = spa->spa_first_txg;
spa->spa_prev_software_version = ub->ub_software_version;
}
static int
spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
{
@ -2458,6 +2537,29 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
nvlist_t *label;
uberblock_t *ub = &spa->spa_uberblock;
/*
* If we are opening the checkpointed state of the pool by
* rewinding to it, at this point we will have written the
* checkpointed uberblock to the vdev labels, so searching
* the labels will find the right uberblock. However, if
* we are opening the checkpointed state read-only, we have
* not modified the labels. Therefore, we must ignore the
* labels and continue using the spa_uberblock that was set
* by spa_ld_checkpoint_rewind.
*
* Note that it would be fine to ignore the labels when
* rewinding (opening writeable) as well. However, if we
* crash just after writing the labels, we will end up
* searching the labels. Doing so in the common case means
* that this code path gets exercised normally, rather than
* just in the edge case.
*/
if (ub->ub_checkpoint_txg != 0 &&
spa_importing_readonly_checkpoint(spa)) {
spa_ld_select_uberblock_done(spa, ub);
return (0);
}
/*
* Find the best uberblock.
*/
@ -2560,14 +2662,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
/*
* Initialize internal SPA structures.
*/
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
spa->spa_first_txg = spa->spa_last_ubsync_txg ?
spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
spa->spa_claim_max_txg = spa->spa_first_txg;
spa->spa_prev_software_version = ub->ub_software_version;
spa_ld_select_uberblock_done(spa, ub);
return (0);
}
@ -2590,7 +2685,7 @@ spa_ld_open_rootbp(spa_t *spa)
}
static int
spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type,
spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
boolean_t reloading)
{
vdev_t *mrvd, *rvd = spa->spa_root_vdev;
@ -3251,7 +3346,7 @@ spa_ld_claim_log_blocks(spa_t *spa)
static void
spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
boolean_t reloading)
boolean_t update_config_cache)
{
vdev_t *rvd = spa->spa_root_vdev;
int need_update = B_FALSE;
@ -3263,7 +3358,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
* If this is a verbatim import, trust the current
* in-core spa_config and update the disk labels.
*/
if (reloading || config_cache_txg != spa->spa_config_txg ||
if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
spa->spa_load_state == SPA_LOAD_IMPORT ||
spa->spa_load_state == SPA_LOAD_RECOVER ||
(spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
@ -3299,18 +3394,38 @@ spa_ld_prepare_for_reload(spa_t *spa)
spa->spa_async_suspended = async_suspended;
}
/*
* Load an existing storage pool, using the config provided. This config
* describes which vdevs are part of the pool and is later validated against
* partial configs present in each vdev's label and an entire copy of the
* config stored in the MOS.
*/
static int
spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
boolean_t reloading)
spa_ld_read_checkpoint_txg(spa_t *spa)
{
uberblock_t checkpoint;
int error = 0;
ASSERT0(spa->spa_checkpoint_txg);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
if (error == ENOENT)
return (0);
if (error != 0)
return (error);
ASSERT3U(checkpoint.ub_txg, !=, 0);
ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
ASSERT3U(checkpoint.ub_timestamp, !=, 0);
spa->spa_checkpoint_txg = checkpoint.ub_txg;
spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
return (0);
}
static int
spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
{
int error = 0;
boolean_t missing_feat_write = B_FALSE;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
@ -3326,11 +3441,6 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
if (type != SPA_IMPORT_ASSEMBLE)
spa->spa_trust_config = B_FALSE;
if (reloading)
spa_load_note(spa, "RELOADING");
else
spa_load_note(spa, "LOADING");
/*
* Parse the config provided to create a vdev tree.
*/
@ -3363,11 +3473,11 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
}
/*
* Read vdev labels to find the best uberblock (i.e. latest, unless
* spa_load_max_txg is set) and store it in spa_uberblock. We get the
* list of features required to read blkptrs in the MOS from the vdev
* label with the best uberblock and verify that our version of zfs
* supports them all.
* Read all vdev labels to find the best uberblock (i.e. latest,
* unless spa_load_max_txg is set) and store it in spa_uberblock. We
* get the list of features required to read blkptrs in the MOS from
* the vdev label with the best uberblock and verify that our version
* of zfs supports them all.
*/
error = spa_ld_select_uberblock(spa, type);
if (error != 0)
@ -3382,23 +3492,211 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
if (error != 0)
return (error);
return (0);
}
static int
spa_ld_checkpoint_rewind(spa_t *spa)
{
uberblock_t checkpoint;
int error = 0;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
if (error != 0) {
spa_load_failed(spa, "unable to retrieve checkpointed "
"uberblock from the MOS config [error=%d]", error);
if (error == ENOENT)
error = ZFS_ERR_NO_CHECKPOINT;
return (error);
}
ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
/*
* We need to update the txg and timestamp of the checkpointed
* uberblock to be higher than the latest one. This ensures that
* the checkpointed uberblock is selected if we were to close and
* reopen the pool right after we've written it in the vdev labels.
* (also see block comment in vdev_uberblock_compare)
*/
checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
checkpoint.ub_timestamp = gethrestime_sec();
/*
* Set current uberblock to be the checkpointed uberblock.
*/
spa->spa_uberblock = checkpoint;
/*
* If we are doing a normal rewind, then the pool is open for
* writing and we sync the "updated" checkpointed uberblock to
* disk. Once this is done, we've basically rewound the whole
* pool and there is no way back.
*
	 * There are cases when we don't want to attempt to sync the
* checkpointed uberblock to disk because we are opening a
* pool as read-only. Specifically, verifying the checkpointed
* state with zdb, and importing the checkpointed state to get
* a "preview" of its content.
*/
if (spa_writeable(spa)) {
vdev_t *rvd = spa->spa_root_vdev;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
for (int c = 0; c < children; c++) {
vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
/* Stop when revisiting the first vdev */
if (c > 0 && svd[0] == vd)
break;
if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
!vdev_is_concrete(vd))
continue;
svd[svdcount++] = vd;
if (svdcount == SPA_SYNC_MIN_VDEVS)
break;
}
error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
if (error == 0)
spa->spa_last_synced_guid = rvd->vdev_guid;
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0) {
spa_load_failed(spa, "failed to write checkpointed "
"uberblock to the vdev labels [error=%d]", error);
return (error);
}
}
return (0);
}
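
A rough sketch of why the bumped fields matter (illustrative, not part of this change): uberblock selection prefers the highest txg and, for equal txgs, the latest timestamp, so after the adjustment above the checkpointed uberblock wins any subsequent label scan. The ub_is_newer() helper below is hypothetical and simplified; the real ordering, including additional tie-breaking, lives in vdev_uberblock_compare().

/* Simplified ordering; see vdev_uberblock_compare() for the real logic. */
static boolean_t
ub_is_newer(const uberblock_t *a, const uberblock_t *b)
{
	if (a->ub_txg != b->ub_txg)
		return (a->ub_txg > b->ub_txg);
	return (a->ub_timestamp > b->ub_timestamp);
}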
static int
spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
boolean_t *update_config_cache)
{
int error;
/*
* Parse the config for pool, open and validate vdevs,
* select an uberblock, and use that uberblock to open
* the MOS.
*/
error = spa_ld_mos_init(spa, type);
if (error != 0)
return (error);
/*
* Retrieve the trusted config stored in the MOS and use it to create
* a new, exact version of the vdev tree, then reopen all vdevs.
*/
error = spa_ld_load_trusted_config(spa, type, reloading);
error = spa_ld_trusted_config(spa, type, B_FALSE);
if (error == EAGAIN) {
VERIFY(!reloading);
if (update_config_cache != NULL)
*update_config_cache = B_TRUE;
/*
* Redo the loading process with the trusted config if it is
* too different from the untrusted config.
*/
spa_ld_prepare_for_reload(spa);
return (spa_load_impl(spa, type, ereport, B_TRUE));
spa_load_note(spa, "RELOADING");
error = spa_ld_mos_init(spa, type);
if (error != 0)
return (error);
error = spa_ld_trusted_config(spa, type, B_TRUE);
if (error != 0)
return (error);
} else if (error != 0) {
return (error);
}
return (0);
}
/*
* Load an existing storage pool, using the config provided. This config
* describes which vdevs are part of the pool and is later validated against
* partial configs present in each vdev's label and an entire copy of the
* config stored in the MOS.
*/
static int
spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
{
int error = 0;
boolean_t missing_feat_write = B_FALSE;
boolean_t checkpoint_rewind =
(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
boolean_t update_config_cache = B_FALSE;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
spa_load_note(spa, "LOADING");
error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
if (error != 0)
return (error);
/*
* If we are rewinding to the checkpoint then we need to repeat
* everything we've done so far in this function but this time
* selecting the checkpointed uberblock and using that to open
* the MOS.
*/
if (checkpoint_rewind) {
/*
		 * If we are rewinding to the checkpoint, update the config
		 * cache anyway.
*/
update_config_cache = B_TRUE;
/*
* Extract the checkpointed uberblock from the current MOS
* and use this as the pool's uberblock from now on. If the
* pool is imported as writeable we also write the checkpoint
* uberblock to the labels, making the rewind permanent.
*/
error = spa_ld_checkpoint_rewind(spa);
if (error != 0)
return (error);
/*
		 * Redo the loading process again with the
* checkpointed uberblock.
*/
spa_ld_prepare_for_reload(spa);
spa_load_note(spa, "LOADING checkpointed uberblock");
error = spa_ld_mos_with_trusted_config(spa, type, NULL);
if (error != 0)
return (error);
}
/*
* Retrieve the checkpoint txg if the pool has a checkpoint.
*/
error = spa_ld_read_checkpoint_txg(spa);
if (error != 0)
return (error);
/*
* Retrieve the mapping of indirect vdevs. Those vdevs were removed
* from the pool and their contents were re-mapped to other vdevs. Note
@ -3501,6 +3799,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
/*
* In case of a checkpoint rewind, log the original txg
* of the checkpointed uberblock.
*/
if (checkpoint_rewind) {
spa_history_log_internal(spa, "checkpoint rewind",
NULL, "rewound state to txg=%llu",
(u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
}
/*
* Traverse the ZIL and claim all blocks.
*/
@ -3527,7 +3835,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
* and the cachefile (by default /etc/zfs/zpool.cache).
*/
spa_ld_check_for_config_update(spa, config_cache_txg,
reloading);
update_config_cache);
/*
* Check all DTLs to see if anything needs resilvering.
@ -3611,6 +3919,15 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
if (load_error == 0)
return (0);
if (load_error == ZFS_ERR_NO_CHECKPOINT) {
/*
* When attempting checkpoint-rewind on a pool with no
* checkpoint, we should not attempt to load uberblocks
* from previous txgs when spa_load fails.
*/
ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
return (load_error);
}
if (spa->spa_root_vdev != NULL)
config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
@ -5302,6 +5619,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
error = (spa_has_checkpoint(spa)) ?
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
return (spa_vdev_exit(spa, NULL, txg, error));
}
if (spa->spa_vdev_removal != NULL ||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
@ -5511,6 +5835,27 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
/*
	 * Besides being called directly from userland through the
* ioctl interface, spa_vdev_detach() can be potentially called
* at the end of spa_vdev_resilver_done().
*
* In the regular case, when we have a checkpoint this shouldn't
* happen as we never empty the DTLs of a vdev during the scrub
	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
* should never get here when we have a checkpoint.
*
	 * That said, even in the case where we checkpoint the pool exactly
	 * as spa_vdev_resilver_done() calls this function, everything
	 * should be fine as the resilver will return right away.
*/
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
error = (spa_has_checkpoint(spa)) ?
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
return (spa_vdev_exit(spa, NULL, txg, error));
}
if (vd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
@ -5749,6 +6094,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
txg = spa_vdev_enter(spa);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
error = (spa_has_checkpoint(spa)) ?
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
return (spa_vdev_exit(spa, NULL, txg, error));
}
/* clear the log and flush everything up to now */
activate_slog = spa_passivate_log(spa);
(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
@ -6414,6 +6766,10 @@ spa_async_suspend(spa_t *spa)
zthr_t *condense_thread = spa->spa_condense_zthr;
if (condense_thread != NULL && zthr_isrunning(condense_thread))
VERIFY0(zthr_cancel(condense_thread));
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL && zthr_isrunning(discard_thread))
VERIFY0(zthr_cancel(discard_thread));
}
void
@ -6428,6 +6784,10 @@ spa_async_resume(spa_t *spa)
zthr_t *condense_thread = spa->spa_condense_zthr;
if (condense_thread != NULL && !zthr_isrunning(condense_thread))
zthr_resume(condense_thread);
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL && !zthr_isrunning(discard_thread))
zthr_resume(discard_thread);
}
static boolean_t
@ -7198,6 +7558,8 @@ spa_sync(spa_t *spa, uint64_t txg)
txg));
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
txg));
break;
}
spa_sync_deferred_frees(spa, tx);
@ -7241,16 +7603,22 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
if (list_is_empty(&spa->spa_config_dirty_list)) {
vdev_t *svd[SPA_SYNC_MIN_VDEVS];
vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
for (int c = 0; c < children; c++) {
vd = rvd->vdev_child[(c0 + c) % children];
/* Stop when revisiting the first vdev */
if (c > 0 && svd[0] == vd)
break;
if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
!vdev_is_concrete(vd))
continue;
svd[svdcount++] = vd;
if (svdcount == SPA_SYNC_MIN_VDEVS)
break;
@ -7313,6 +7681,9 @@ spa_sync(spa_t *spa, uint64_t txg)
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
while (zfs_pause_spa_sync)
delay(1);
spa->spa_sync_pass = 0;
/*

View File

@ -338,12 +338,15 @@ int spa_asize_inflation = 24;
* These are the operations that call dsl_pool_adjustedsize() with the netfree
* argument set to TRUE.
*
* Operations that are almost guaranteed to free up space in the absence of
* a pool checkpoint can use up to three quarters of the slop space
* (e.g. zfs destroy).
*
* A very restricted set of operations are always permitted, regardless of
* the amount of free space. These are the operations that call
* dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these
* operations result in a net increase in the amount of space used,
* it is possible to run the pool completely out of space, causing it to
* be permanently read-only.
* dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
* increase in the amount of space used, it is possible to run the pool
* completely out of space, causing it to be permanently read-only.
*
* Note that on very small pools, the slop space will be larger than
* 3.2%, in an effort to have it be at least spa_min_slop (128MB),
@ -1717,6 +1720,12 @@ spa_get_dspace(spa_t *spa)
return (spa->spa_dspace);
}
uint64_t
spa_get_checkpoint_space(spa_t *spa)
{
return (spa->spa_checkpoint_info.sci_dspace);
}
void
spa_update_dspace(spa_t *spa)
{
@ -2026,7 +2035,8 @@ spa_writeable(spa_t *spa)
boolean_t
spa_has_pending_synctask(spa_t *spa)
{
return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
!txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
}
int
@ -2182,3 +2192,60 @@ spa_set_missing_tvds(spa_t *spa, uint64_t missing)
{
spa->spa_missing_tvds = missing;
}
boolean_t
spa_top_vdevs_spacemap_addressable(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
return (B_FALSE);
}
return (B_TRUE);
}
boolean_t
spa_has_checkpoint(spa_t *spa)
{
return (spa->spa_checkpoint_txg != 0);
}
boolean_t
spa_importing_readonly_checkpoint(spa_t *spa)
{
return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
spa->spa_mode == FREAD);
}
uint64_t
spa_min_claim_txg(spa_t *spa)
{
uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
if (checkpoint_txg != 0)
return (checkpoint_txg + 1);
return (spa->spa_first_txg);
}
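For concreteness, a small worked example of the claim boundary this helper establishes; the txg values below are invented for illustration:
/*
 * Illustration only: if a checkpoint was taken while the uberblock was at
 * txg 100, then ub_checkpoint_txg == 100 and, after a rewind,
 * spa_min_claim_txg() returns 101.  A ZIL block born at txg 95 is still
 * claimable, while one born at txg 120 is treated as never allocated;
 * this is the blk_birth >= spa_min_claim_txg() test applied by the ZIL
 * claim code elsewhere in this change.
 */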
/*
* If there is a checkpoint, async destroys may consume more space from
* the pool instead of freeing it. In an attempt to save the pool from
* getting suspended when it is about to run out of space, we stop
* processing async destroys.
*/
boolean_t
spa_suspend_async_destroy(spa_t *spa)
{
dsl_pool_t *dp = spa_get_dsl(spa);
uint64_t unreserved = dsl_pool_unreserved_space(dp,
ZFS_SPACE_CHECK_EXTRA_RESERVED);
uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
if (spa_has_checkpoint(spa) && avail == 0)
return (B_TRUE);
return (B_FALSE);
}

View File

@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -38,12 +38,13 @@
#include <sys/zfeature.h>
/*
* Note on space map block size:
*
* The data for a given space map can be kept on blocks of any size.
* Larger blocks entail fewer i/o operations, but they also cause the
* DMU to keep more data in-core, and also to waste more i/o bandwidth
* when only a few blocks have changed since the last transaction group.
*/
int space_map_blksz = (1 << 12);
/*
* Iterate through the space map, invoking the callback on each (non-debug)
@ -105,6 +106,137 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
return (error);
}
/*
* Note: This function performs destructive actions - specifically
* it deletes entries from the end of the space map. Thus, callers
* should ensure that they are holding the appropriate locks for
* the space map that they provide.
*/
int
space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx)
{
uint64_t bufsize, len;
uint64_t *entry_map;
int error = 0;
len = space_map_length(sm);
bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
entry_map = zio_buf_alloc(bufsize);
dmu_buf_will_dirty(sm->sm_dbuf, tx);
/*
* Since we can't move the starting offset of the space map
* (e.g. there are references on-disk pointing to it), we destroy
* its entries incrementally starting from the end.
*
* The logic that follows is basically the same as the one used
* in space_map_iterate() but it traverses the space map
* backwards:
*
* 1] We figure out the size of the buffer that we want to use
* to read the on-disk space map entries.
* 2] We figure out the offset at the end of the space map where
* we will start reading entries into our buffer.
* 3] We read the on-disk entries into the buffer.
* 4] We iterate over the entries from end to beginning calling
* the callback function on each one. As we move from entry
* to entry we decrease the size of the space map, effectively
* deleting each entry.
* 5] If there are no more entries in the space map or the
* callback returns a value other than 0, we stop iterating
* over the space map. If there are entries remaining and
* the callback returned zero we go back to step [1].
*/
uint64_t offset = 0, size = 0;
while (len > 0 && error == 0) {
size = MIN(bufsize, len);
VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
VERIFY3U(size, >, 0);
ASSERT3U(sm->sm_blksz, !=, 0);
offset = len - size;
IMPLY(bufsize > len, offset == 0);
IMPLY(bufsize == len, offset == 0);
IMPLY(bufsize < len, offset > 0);
EQUIV(size == len, offset == 0);
IMPLY(size < len, bufsize < len);
dprintf("object=%llu offset=%llx size=%llx\n",
space_map_object(sm), offset, size);
error = dmu_read(sm->sm_os, space_map_object(sm),
offset, size, entry_map, DMU_READ_PREFETCH);
if (error != 0)
break;
uint64_t num_entries = size / sizeof (uint64_t);
ASSERT3U(num_entries, >, 0);
while (num_entries > 0) {
uint64_t e, entry_offset, entry_size;
maptype_t type;
e = entry_map[num_entries - 1];
ASSERT3U(num_entries, >, 0);
ASSERT0(error);
if (SM_DEBUG_DECODE(e)) {
sm->sm_phys->smp_objsize -= sizeof (uint64_t);
space_map_update(sm);
len -= sizeof (uint64_t);
num_entries--;
continue;
}
type = SM_TYPE_DECODE(e);
entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
sm->sm_start;
entry_size = SM_RUN_DECODE(e) << sm->sm_shift;
VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift));
VERIFY3U(entry_offset, >=, sm->sm_start);
VERIFY3U(entry_offset + entry_size, <=,
sm->sm_start + sm->sm_size);
error = callback(type, entry_offset, entry_size, arg);
if (error != 0)
break;
if (type == SM_ALLOC)
sm->sm_phys->smp_alloc -= entry_size;
else
sm->sm_phys->smp_alloc += entry_size;
sm->sm_phys->smp_objsize -= sizeof (uint64_t);
space_map_update(sm);
len -= sizeof (uint64_t);
num_entries--;
}
IMPLY(error == 0, num_entries == 0);
EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0);
}
if (len == 0) {
ASSERT0(error);
ASSERT0(offset);
ASSERT0(sm->sm_length);
ASSERT0(sm->sm_phys->smp_objsize);
ASSERT0(sm->sm_alloc);
}
zio_buf_free(entry_map, bufsize);
return (error);
}
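As a usage illustration (not part of the change), a minimal sketch of how a sync-context caller might drive space_map_incremental_destroy(); the callback name and the tallying it does are invented here, the real consumer being the checkpoint-discard sync task:
/* Sketch only: the callback matches the sm_cb_t signature from space_map.h. */
/* ARGSUSED */
static int
count_freed_cb(maptype_t type, uint64_t offset, uint64_t size, void *arg)
{
	uint64_t *freed = arg;

	/* Entries passed in have already been trimmed off the space map's tail. */
	if (type == SM_FREE)
		*freed += size;
	return (0);	/* a non-zero return stops the incremental destroy early */
}

	/* ... in syncing context, with the space map's lock held ... */
	uint64_t freed = 0;
	VERIFY0(space_map_incremental_destroy(sm, count_freed_cb, &freed, tx));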
typedef struct space_map_load_arg {
space_map_t *smla_sm;
range_tree_t *smla_rt;
@ -279,7 +411,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
*/
sm->sm_phys->smp_object = sm->sm_object;
if (range_tree_space(rt) == 0) {
if (range_tree_is_empty(rt)) {
VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
return;
}
@ -413,7 +545,7 @@ space_map_close(space_map_t *sm)
}
void
space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
{
objset_t *os = sm->sm_os;
spa_t *spa = dmu_objset_spa(os);
@ -435,7 +567,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
*/
if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
doi.doi_data_block_size != space_map_blksz) {
doi.doi_data_block_size != blocksize) {
zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
"object[%llu]: old bonus %u, old blocksz %u",
dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
@ -444,7 +576,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
space_map_free(sm, tx);
dmu_buf_rele(sm->sm_dbuf, sm);
sm->sm_object = space_map_alloc(sm->sm_os, tx);
sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
VERIFY0(space_map_open_impl(sm));
} else {
VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
@ -477,7 +609,7 @@ space_map_update(space_map_t *sm)
}
uint64_t
space_map_alloc(objset_t *os, dmu_tx_t *tx)
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
spa_t *spa = dmu_objset_spa(os);
uint64_t object;
@ -491,8 +623,7 @@ space_map_alloc(objset_t *os, dmu_tx_t *tx)
bonuslen = SPACE_MAP_SIZE_V0;
}
object = dmu_object_alloc(os,
DMU_OT_SPACE_MAP, space_map_blksz,
object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize,
DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
return (object);

View File

@ -331,6 +331,7 @@ typedef struct dmu_buf {
#define DMU_POOL_REMOVING "com.delphix:removing"
#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
/*
* Allocate an object from this objset. The range of object numbers

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@ -134,6 +134,7 @@ uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
const char *name, dmu_tx_t *tx);
uint64_t dsl_dir_get_used(dsl_dir_t *dd);
uint64_t dsl_dir_get_compressed(dsl_dir_t *dd);
uint64_t dsl_dir_get_quota(dsl_dir_t *dd);
uint64_t dsl_dir_get_reservation(dsl_dir_t *dd);
uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd);

View File

@ -38,6 +38,7 @@
#include <sys/bpobj.h>
#include <sys/bptree.h>
#include <sys/rrwlock.h>
#include <sys/dsl_synctask.h>
#ifdef __cplusplus
extern "C" {
@ -122,6 +123,7 @@ typedef struct dsl_pool {
txg_list_t dp_dirty_zilogs;
txg_list_t dp_dirty_dirs;
txg_list_t dp_sync_tasks;
txg_list_t dp_early_sync_tasks;
taskq_t *dp_sync_taskq;
taskq_t *dp_zil_clean_taskq;
@ -144,7 +146,9 @@ dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
int dsl_pool_sync_context(dsl_pool_t *dp);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
zfs_space_check_t slop_policy);
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
@ -155,6 +159,8 @@ void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
int64_t used, int64_t comp, int64_t uncomp);
void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp,
int64_t used, int64_t comp, int64_t uncomp);
void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_SYNCTASK_H
@ -57,14 +57,41 @@ typedef enum zfs_space_check {
ZFS_SPACE_CHECK_RESERVED,
/*
* No space check is performed. Only operations which we expect to
* result in a net reduction in space should use this
* (e.g. "zfs destroy". Setting quotas & reservations also uses
* this because it needs to circumvent the quota/reservation checks).
* Space check allows use of three quarters of the slop space.
* If there is less than 0.8% free space, the operation will
* fail.
*/
ZFS_SPACE_CHECK_EXTRA_RESERVED,
/*
* In all cases "zfs destroy" is expected to result in an net
* reduction of space, except one. When the pool has a
* checkpoint, space freed by "zfs destroy" will not actually
* free anything internally. Thus, it starts failing after
* three quarters of the slop space is exceeded.
*/
ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED,
/*
* A channel program can run a "zfs destroy" as part of its
* script and therefore has the same space_check policy when
* being evaluated.
*/
ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY,
/*
* No space check is performed. This level of space check should
* be used cautiously, as operations that use it can run even when
* less than 0.8% of the capacity is left. In this scenario, if there is a
* checkpoint, async destroys are suspended and any kind of freeing
* can potentially add space instead of freeing it.
*
* See also the comments above spa_slop_shift.
*/
ZFS_SPACE_CHECK_NONE,
ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE,
} zfs_space_check_t;
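A rough sketch of the arithmetic behind the 0.8% figure above, assuming the default spa_slop_shift of 5 and ignoring the spa_min_slop floor (neither of which is shown in this hunk):

    slop                          ~ pool_size >> 5            (about 3.2% of the pool)
    ZFS_SPACE_CHECK_DESTROY limit ~ one quarter of the slop    (about 0.8% of the pool)

On a 10 TB pool that is roughly 320 GB of slop; "zfs destroy" and other ZFS_SPACE_CHECK_DESTROY operations start failing once free space drops below about 80 GB, while ZFS_SPACE_CHECK_NONE operations may still run inside that last reserve.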
typedef struct dsl_sync_task {
@ -85,6 +112,10 @@ int dsl_sync_task(const char *, dsl_checkfunc_t *,
dsl_syncfunc_t *, void *, int, zfs_space_check_t);
void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
void *, int, zfs_space_check_t, dmu_tx_t *);
int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
dsl_syncfunc_t *, void *, int, zfs_space_check_t);
void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
void *, int, zfs_space_check_t, dmu_tx_t *);
#ifdef __cplusplus
}

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@ -69,8 +69,8 @@ int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t);
void metaslab_free_dva(spa_t *, const dva_t *, uint64_t);
void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@ -255,16 +255,16 @@ struct metaslab_group {
/*
* Each metaslab maintains a set of in-core trees to track metaslab
* operations. The in-core free tree (ms_tree) contains the list of
* operations. The in-core free tree (ms_allocatable) contains the list of
* free segments which are eligible for allocation. As blocks are
* allocated, the allocated segment are removed from the ms_tree and
* added to a per txg allocation tree (ms_alloctree). As blocks are
* freed, they are added to the free tree (ms_freeingtree). These trees
* allocated, the allocated segment are removed from the ms_allocatable and
* added to a per txg allocation tree (ms_allocating). As blocks are
* freed, they are added to the free tree (ms_freeing). These trees
* allow us to process all allocations and frees in syncing context
* where it is safe to update the on-disk space maps. An additional set
* of in-core trees is maintained to track deferred frees
* (ms_defertree). Once a block is freed it will move from the
* ms_freedtree to the ms_defertree. A deferred free means that a block
* (ms_defer). Once a block is freed it will move from the
* ms_freed to the ms_defer tree. A deferred free means that a block
* has been freed but cannot be used by the pool until TXG_DEFER_SIZE
* transactions groups later. For example, a block that is freed in txg
* 50 will not be available for reallocation until txg 52 (50 +
@ -278,14 +278,14 @@ struct metaslab_group {
* ALLOCATE
* |
* V
* free segment (ms_tree) -----> ms_alloctree[4] ----> (write to space map)
* free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
* ^
* | ms_freeingtree <--- FREE
* | |
* | v
* | ms_freedtree
* | |
* +-------- ms_defertree[2] <-------+---------> (write to space map)
* | ms_freeing <--- FREE
* | |
* | v
* | ms_freed
* | |
* +-------- ms_defer[2] <-------+-------> (write to space map)
*
*
* Each metaslab's space is tracked in a single space map in the MOS,
@ -296,8 +296,8 @@ struct metaslab_group {
* To load the in-core free tree we read the space map from disk. This
* object contains a series of alloc and free records that are combined
* to make up the list of all free segments in this metaslab. These
* segments are represented in-core by the ms_tree and are stored in an
* AVL tree.
* segments are represented in-core by the ms_allocatable and are stored
* in an AVL tree.
*
* As the space map grows (as a result of the appends) it will
* eventually become space-inefficient. When the metaslab's in-core
@ -317,20 +317,22 @@ struct metaslab {
uint64_t ms_size;
uint64_t ms_fragmentation;
range_tree_t *ms_alloctree[TXG_SIZE];
range_tree_t *ms_tree;
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
/*
* The following range trees are accessed only from syncing context.
* ms_free*tree only have entries while syncing, and are empty
* between syncs.
*/
range_tree_t *ms_freeingtree; /* to free this syncing txg */
range_tree_t *ms_freedtree; /* already freed this syncing txg */
range_tree_t *ms_defertree[TXG_DEFER_SIZE];
range_tree_t *ms_freeing; /* to free this syncing txg */
range_tree_t *ms_freed; /* already freed this syncing txg */
range_tree_t *ms_defer[TXG_DEFER_SIZE];
range_tree_t *ms_checkpointing; /* to add to the checkpoint */
boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
uint64_t ms_condense_checked_txg;
/*
* We must hold both ms_lock and ms_group->mg_lock in order to
@ -356,11 +358,12 @@ struct metaslab {
/*
* The metaslab block allocators can optionally use a size-ordered
* range tree and/or an array of LBAs. Not all allocators use
* this functionality. The ms_size_tree should always contain the
* same number of segments as the ms_tree. The only difference
* is that the ms_size_tree is ordered by segment sizes.
* this functionality. The ms_allocatable_by_size should always
* contain the same number of segments as the ms_allocatable. The
* only difference is that the ms_allocatable_by_size is ordered by
* segment sizes.
*/
avl_tree_t ms_size_tree;
avl_tree_t ms_allocatable_by_size;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_RANGE_TREE_H
@ -82,6 +82,7 @@ range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);

View File

@ -626,6 +626,8 @@ extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_checkpoint(const char *pool);
extern int spa_checkpoint_discard(const char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce);
extern int spa_reset(char *pool);
@ -788,6 +790,7 @@ extern spa_load_state_t spa_load_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
extern uint64_t spa_get_dspace(spa_t *spa);
extern uint64_t spa_get_checkpoint_space(spa_t *spa);
extern uint64_t spa_get_slop_space(spa_t *spa);
extern void spa_update_dspace(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
@ -836,6 +839,10 @@ extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
extern boolean_t spa_has_checkpoint(spa_t *spa);
extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
extern boolean_t spa_suspend_async_destroy(spa_t *spa);
extern uint64_t spa_min_claim_txg(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
const blkptr_t *bp);
@ -847,6 +854,7 @@ extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);

View File

@ -31,6 +31,7 @@
#define _SYS_SPA_IMPL_H
#include <sys/spa.h>
#include <sys/spa_checkpoint.h>
#include <sys/vdev.h>
#include <sys/vdev_removal.h>
#include <sys/metaslab.h>
@ -281,6 +282,10 @@ struct spa {
spa_condensing_indirect_t *spa_condensing_indirect;
zthr_t *spa_condense_zthr; /* zthr doing condense. */
uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */
spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
zthr_t *spa_checkpoint_discard_zthr;
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@ -57,7 +57,7 @@ extern "C" {
typedef struct space_map_phys {
uint64_t smp_object; /* on-disk space map object */
uint64_t smp_objsize; /* size of the object */
uint64_t smp_alloc; /* space allocated from the map */
int64_t smp_alloc; /* space allocated from the map */
uint64_t smp_pad[5]; /* reserved */
/*
@ -82,7 +82,7 @@ typedef struct space_map {
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
uint64_t sm_length; /* synced length */
uint64_t sm_alloc; /* synced space allocated */
int64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */
@ -140,6 +140,8 @@ typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
@ -153,8 +155,8 @@ uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
dmu_tx_t *tx);
void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx);
uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx);
void space_map_free(space_map_t *sm, dmu_tx_t *tx);
void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2017 by Delphix. All rights reserved.
* Copyright (c) 2016, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_UBERBLOCK_IMPL_H
@ -60,6 +60,28 @@ struct uberblock {
uint64_t ub_mmp_magic;
uint64_t ub_mmp_delay;
uint64_t ub_mmp_seq;
/*
* ub_checkpoint_txg indicates two things about the current uberblock:
*
* 1] If it is not zero then this uberblock is a checkpoint. If it is
* zero, then this uberblock is not a checkpoint.
*
* 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
* the ub_txg that the uberblock had at the time we moved it to
* the MOS config.
*
* The field is set when we checkpoint the uberblock and continues to
* hold that value even after we've rewound (unlike the ub_txg that
* is reset to a higher value).
*
* Besides checks used to determine whether we are reopening the
* pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
* the value of the field is used to determine which ZIL blocks have
* been allocated according to the ms_sm when we are rewinding to a
* checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
* the ZIL block is not allocated [see uses of spa_min_claim_txg()].
*/
uint64_t ub_checkpoint_txg;
};

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@ -81,7 +81,7 @@ extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
uint64_t size, uint64_t txg);
uint64_t size);
extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
uint64_t offset, uint64_t size, dmu_tx_t *tx);
@ -121,6 +121,7 @@ extern boolean_t vdev_readable(vdev_t *vd);
extern boolean_t vdev_writeable(vdev_t *vd);
extern boolean_t vdev_allocatable(vdev_t *vd);
extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
extern void vdev_cache_init(vdev_t *vd);
extern void vdev_cache_fini(vdev_t *vd);

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@ -228,6 +228,9 @@ struct vdev {
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
/*
* Values stored in the config for an indirect or removing vdev.
*/
@ -437,6 +440,7 @@ extern void vdev_set_min_asize(vdev_t *vd);
/*
* Global variables
*/
extern int vdev_standard_sm_blksz;
/* zdb uses this tunable, so it must be declared here to make lint happy. */
extern int zfs_vdev_cache_size;
@ -449,6 +453,11 @@ extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
extern int vdev_obsolete_sm_object(vdev_t *vd);
extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);
/*
* Other miscellaneous functions
*/
int vdev_checkpoint_sm_object(vdev_t *vd);
/*
* The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
*/

View File

@ -14,7 +14,7 @@
*/
/*
* Copyright (c) 2014, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_REMOVAL_H
@ -79,7 +79,7 @@ extern void spa_condense_fini(spa_t *);
extern void spa_start_indirect_condensing_thread(spa_t *);
extern void spa_vdev_condense_suspend(spa_t *);
extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
extern void spa_vdev_remove_suspend(spa_t *);

View File

@ -517,7 +517,6 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
blkptr_t *old_bp, uint64_t size, boolean_t *slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);

View File

@ -13,7 +13,6 @@
* CDDL HEADER END
*/
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
*/

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -57,6 +57,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
ub->ub_guid_sum = rvd->vdev_guid_sum;
ub->ub_timestamp = gethrestime_sec();
ub->ub_software_version = SPA_VERSION;
ub->ub_checkpoint_txg = 0;
return (ub->ub_rootbp.blk_birth == txg);
}

View File

@ -71,14 +71,30 @@ static vdev_ops_t *vdev_ops_table[] = {
/* maximum scrub/resilver I/O queue per leaf vdev */
int zfs_scrub_limit = 10;
/*
* When a vdev is added, it will be divided into approximately (but no
* more than) this number of metaslabs.
*/
int metaslabs_per_vdev = 200;
/* maximum number of metaslabs per top-level vdev */
int vdev_max_ms_count = 200;
/* minimum amount of metaslabs per top-level vdev */
int vdev_min_ms_count = 16;
/* see comment in vdev_metaslab_set_size() */
int vdev_default_ms_shift = 29;
boolean_t vdev_validate_skip = B_FALSE;
/*
* Since the DTL space map of a vdev is not expected to have a lot of
* entries, we default its block size to 4K.
*/
int vdev_dtl_sm_blksz = (1 << 12);
/*
* vdev-wide space maps that have lots of entries written to them at
* the end of each transaction group can benefit from a higher I/O bandwidth
* (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
*/
int vdev_standard_sm_blksz = (1 << 17);
/*PRINTFLIKE2*/
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
@ -839,6 +855,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
if (tvd->vdev_mg != NULL)
tvd->vdev_mg->mg_vd = tvd;
tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
svd->vdev_checkpoint_sm = NULL;
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
@ -1043,6 +1062,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
void
vdev_metaslab_fini(vdev_t *vd)
{
if (vd->vdev_checkpoint_sm != NULL) {
ASSERT(spa_feature_is_active(vd->vdev_spa,
SPA_FEATURE_POOL_CHECKPOINT));
space_map_close(vd->vdev_checkpoint_sm);
/*
* Even though we close the space map, we need to set its
* pointer to NULL. The reason is that vdev_metaslab_fini()
* may be called multiple times for certain operations
* (e.g. when destroying a pool), so we need to ensure that
* this clause never executes twice. This logic is similar
* to the one used for the vdev_ms clause below.
*/
vd->vdev_checkpoint_sm = NULL;
}
if (vd->vdev_ms != NULL) {
uint64_t count = vd->vdev_ms_count;
@ -1940,11 +1974,39 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
void
vdev_metaslab_set_size(vdev_t *vd)
{
uint64_t asize = vd->vdev_asize;
uint64_t ms_shift = 0;
/*
* Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
* For vdevs that are bigger than 8G the metaslab size varies in
* a way that the number of metaslabs increases in powers of two,
* linearly in terms of vdev_asize, starting from 16 metaslabs.
* So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32,
* and so on, until we hit the maximum metaslab count limit
* [vdev_max_ms_count] from which point the metaslab count stays
* the same.
*/
vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
ms_shift = vdev_default_ms_shift;
if ((asize >> ms_shift) < vdev_min_ms_count) {
/*
* For devices that are less than 8G we want to have
* exactly 16 metaslabs. We don't want fewer, as integer
* division rounds down, so fewer metaslabs mean more
* wasted space. We don't want more, as these vdevs are
* small, and in the likely event that we are running
* out of space, the SPA will have a hard time finding
* space due to fragmentation.
*/
ms_shift = highbit64(asize / vdev_min_ms_count);
ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT);
} else if ((asize >> ms_shift) > vdev_max_ms_count) {
ms_shift = highbit64(asize / vdev_max_ms_count);
}
vd->vdev_ms_shift = ms_shift;
ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
}
void
@ -2049,7 +2111,7 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
return (B_FALSE);
mutex_enter(&vd->vdev_dtl_lock);
if (range_tree_space(rt) != 0)
if (!range_tree_is_empty(rt))
dirty = range_tree_contains(rt, txg, size);
mutex_exit(&vd->vdev_dtl_lock);
@ -2063,7 +2125,7 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
boolean_t empty;
mutex_enter(&vd->vdev_dtl_lock);
empty = (range_tree_space(rt) == 0);
empty = range_tree_is_empty(rt);
mutex_exit(&vd->vdev_dtl_lock);
return (empty);
@ -2122,7 +2184,7 @@ vdev_dtl_should_excise(vdev_t *vd)
return (B_FALSE);
if (vd->vdev_resilver_txg == 0 ||
range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
return (B_TRUE);
/*
@ -2219,8 +2281,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
* DTLs then reset its resilvering flag.
*/
if (vd->vdev_resilver_txg != 0 &&
range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE]))
vd->vdev_resilver_txg = 0;
mutex_exit(&vd->vdev_dtl_lock);
@ -2378,7 +2440,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
if (vd->vdev_dtl_sm == NULL) {
uint64_t new_object;
new_object = space_map_alloc(mos, tx);
new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
@ -2392,7 +2454,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
range_tree_walk(rt, range_tree_add, rtsync);
mutex_exit(&vd->vdev_dtl_lock);
space_map_truncate(vd->vdev_dtl_sm, tx);
space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
range_tree_vacate(rtsync, NULL, NULL);
@ -2463,7 +2525,7 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock);
if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
vdev_writeable(vd)) {
thismin = vdev_dtl_min(vd);
@ -2491,6 +2553,28 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
return (needed);
}
/*
* Gets the checkpoint space map object from the vdev's ZAP.
* Returns the spacemap object, or 0 if it wasn't in the ZAP
* or the ZAP doesn't exist yet.
*/
int
vdev_checkpoint_sm_object(vdev_t *vd)
{
ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
if (vd->vdev_top_zap == 0) {
return (0);
}
uint64_t sm_obj = 0;
int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj);
ASSERT(err == 0 || err == ENOENT);
return (sm_obj);
}
int
vdev_load(vdev_t *vd)
{
@ -2525,6 +2609,35 @@ vdev_load(vdev_t *vd)
VDEV_AUX_CORRUPT_DATA);
return (error);
}
uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
if (checkpoint_sm_obj != 0) {
objset_t *mos = spa_meta_objset(vd->vdev_spa);
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
if ((error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
vd->vdev_ashift))) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",
(u_longlong_t)checkpoint_sm_obj, error);
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
space_map_update(vd->vdev_checkpoint_sm);
/*
* Since the checkpoint_sm contains free entries
* exclusively, we can use sm_alloc to indicate the
* cumulative checkpointed space that has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
-vd->vdev_checkpoint_sm->sm_alloc;
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
}
}
/*
@ -2542,7 +2655,7 @@ vdev_load(vdev_t *vd)
if (obsolete_sm_object != 0) {
objset_t *mos = vd->vdev_spa->spa_meta_objset;
ASSERT(vd->vdev_asize != 0);
ASSERT(vd->vdev_obsolete_sm == NULL);
ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
obsolete_sm_object, 0, vd->vdev_asize, 0))) {
@ -2668,6 +2781,12 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg)
mutex_exit(&msp->ms_lock);
}
if (vd->vdev_checkpoint_sm != NULL) {
ASSERT(spa_has_checkpoint(spa));
space_map_close(vd->vdev_checkpoint_sm);
vd->vdev_checkpoint_sm = NULL;
}
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
@ -2975,6 +3094,17 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
error = spa_reset_logs(spa);
/*
* If the log device was successfully reset but has
* checkpointed data, do not offline it.
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
!=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}
spa_vdev_state_enter(spa, SCL_ALLOC);
/*
@ -3167,6 +3297,23 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
return (B_TRUE);
}
boolean_t
vdev_is_spacemap_addressable(vdev_t *vd)
{
/*
* Assuming 47 bits of the space map entry are dedicated to the entry's
* offset (see description in space_map.h), we calculate the maximum
* address that can be described by a space map entry for the given
* device.
*/
uint64_t shift = vd->vdev_ashift + 47;
if (shift >= 63) /* detect potential overflow */
return (B_TRUE);
return (vd->vdev_asize < (1ULL << shift));
}
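To make the bound above concrete, simple arithmetic on the code as shown: with ashift = 9 the largest offset a space map entry can describe is 2^(9+47) = 2^56 bytes (64 PiB), and with ashift = 12 it is 2^59 bytes (512 PiB); for ashift values that push the shift to 63 or more, the early return reports the vdev as addressable while avoiding an overflowing shift.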
/*
* Get statistics for the given vdev.
*/

View File

@ -204,14 +204,13 @@ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ticks = 0;
/*
* Mark the given offset and size as being obsolete in the given txg.
* Mark the given offset and size as being obsolete.
*/
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
uint64_t txg)
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
{
spa_t *spa = vd->vdev_spa;
ASSERT3U(spa_syncing_txg(spa), ==, txg);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
ASSERT(size > 0);
@ -222,7 +221,7 @@ vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
mutex_enter(&vd->vdev_obsolete_lock);
range_tree_add(vd->vdev_obsolete_segments, offset, size);
mutex_exit(&vd->vdev_obsolete_lock);
vdev_dirty(vd, 0, NULL, txg);
vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
}
}
@ -240,7 +239,7 @@ spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
/* The DMU can only remap indirect vdevs. */
ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
vdev_indirect_mark_obsolete(vd, offset, size);
}
static spa_condensing_indirect_t *
@ -630,7 +629,8 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr)
return (0);
VERIFY0(dsl_sync_task(spa_name(spa), NULL,
spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
spa_condense_indirect_complete_sync, sci, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED));
return (0);
}
@ -707,7 +707,8 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
if (vdev_obsolete_sm_object(vd) == 0) {
uint64_t obsolete_sm_object =
space_map_alloc(spa->spa_meta_objset, tx);
space_map_alloc(spa->spa_meta_objset,
vdev_standard_sm_blksz, tx);
ASSERT(vd->vdev_top_zap != 0);
VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
/*
@ -209,6 +209,37 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
}
static void
root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
{
spa_t *spa = vd->vdev_spa;
if (vd != spa->spa_root_vdev)
return;
/* provide either current or previous scan information */
pool_scan_stat_t ps;
if (spa_scan_get_stats(spa, &ps) == 0) {
fnvlist_add_uint64_array(nvl,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
sizeof (pool_scan_stat_t) / sizeof (uint64_t));
}
pool_removal_stat_t prs;
if (spa_removal_get_stats(spa, &prs) == 0) {
fnvlist_add_uint64_array(nvl,
ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
sizeof (prs) / sizeof (uint64_t));
}
pool_checkpoint_stat_t pcs;
if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
fnvlist_add_uint64_array(nvl,
ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
sizeof (pcs) / sizeof (uint64_t));
}
}
/*
* Generate the nvlist representing this vdev's config.
*/
@ -331,20 +362,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
/* provide either current or previous scan information */
pool_scan_stat_t ps;
if (spa_scan_get_stats(spa, &ps) == 0) {
fnvlist_add_uint64_array(nv,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
sizeof (pool_scan_stat_t) / sizeof (uint64_t));
}
pool_removal_stat_t prs;
if (spa_removal_get_stats(spa, &prs) == 0) {
fnvlist_add_uint64_array(nv,
ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
sizeof (prs) / sizeof (uint64_t));
}
root_vdev_actions_getprogress(vd, nv);
/*
* Note: this can be called from open context
@ -1291,11 +1309,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
{
spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock;
vdev_t *vd;
zio_t *zio;
int error = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
ASSERT(svdcount != 0);
retry:
/*
* Normally, we don't want to try too hard to write every label and
@ -1334,9 +1351,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
* written in this txg will be committed to stable storage
* before any uberblock that references them.
*/
zio = zio_root(spa, NULL, NULL, flags);
zio_t *zio = zio_root(spa, NULL, NULL, flags);
for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
for (vdev_t *vd =
txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
zio_flush(zio, vd);
@ -1351,8 +1369,14 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
* the new labels to disk to ensure that all even-label updates
* are committed to stable storage before the uberblock update.
*/
if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
if ((flags & ZIO_FLAG_TRYHARD) != 0) {
zfs_dbgmsg("vdev_label_sync_list() returned error %d "
"for pool '%s' when syncing out the even labels "
"of dirty vdevs", error, spa_name(spa));
}
goto retry;
}
/*
* Sync the uberblocks to all vdevs in svd[].
@ -1369,8 +1393,13 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
* been successfully committed) will be valid with respect
* to the new uberblocks.
*/
if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
if ((flags & ZIO_FLAG_TRYHARD) != 0) {
zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
"%d for pool '%s'", error, spa_name(spa));
}
goto retry;
}
/*
* Sync out odd labels for every dirty vdev. If the system dies
@ -1382,8 +1411,14 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
* to disk to ensure that all odd-label updates are committed to
* stable storage before the next transaction group begins.
*/
if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
if ((flags & ZIO_FLAG_TRYHARD) != 0) {
zfs_dbgmsg("vdev_label_sync_list() returned error %d "
"for pool '%s' when syncing out the odd labels of "
"dirty vdevs", error, spa_name(spa));
}
goto retry;
}
return (0);
}

View File

@ -110,6 +110,12 @@ int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
*/
int zfs_remove_max_segment = 1024 * 1024;
/*
* This is used by the test suite so that it can ensure that certain
* actions happen while in the middle of a removal.
*/
uint64_t zfs_remove_max_bytes_pause = UINT64_MAX;
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
static void spa_vdev_remove_thread(void *arg);
@ -278,11 +284,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
* be copied.
*/
spa->spa_removing_phys.sr_to_copy -=
range_tree_space(ms->ms_freeingtree);
range_tree_space(ms->ms_freeing);
ASSERT0(range_tree_space(ms->ms_freedtree));
ASSERT0(range_tree_space(ms->ms_freed));
for (int t = 0; t < TXG_SIZE; t++)
ASSERT0(range_tree_space(ms->ms_alloctree[t]));
ASSERT0(range_tree_space(ms->ms_allocating[t]));
}
/*
@ -462,19 +468,18 @@ spa_restart_removal(spa_t *spa)
* and we correctly free already-copied data.
*/
void
free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
uint64_t txg)
free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
{
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
uint64_t txg = spa_syncing_txg(spa);
uint64_t max_offset_yet = 0;
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
vdev_indirect_mapping_object(vim));
ASSERT3P(vd, ==, svr->svr_vdev);
ASSERT3U(spa_syncing_txg(spa), ==, txg);
mutex_enter(&svr->svr_lock);
@ -489,8 +494,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
* held, so that the remove_thread can not load this metaslab and then
* visit this offset between the time that we metaslab_free_concrete()
* and when we check to see if it has been visited.
*
* Note: The checkpoint flag is set to false as having/taking
* a checkpoint and removing a device can't happen at the same
* time.
*/
metaslab_free_concrete(vd, offset, size, txg);
ASSERT(!spa_has_checkpoint(spa));
metaslab_free_concrete(vd, offset, size, B_FALSE);
uint64_t synced_size = 0;
uint64_t synced_offset = 0;
@ -622,16 +632,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
* of this free.
*/
if (synced_size > 0) {
vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
txg);
vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
/*
* Note: this can only be called from syncing context,
* and the vdev_indirect_mapping is only changed from the
* sync thread, so we don't need svr_lock while doing
* metaslab_free_impl_cb.
*/
boolean_t checkpoint = B_FALSE;
vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
metaslab_free_impl_cb, &txg);
metaslab_free_impl_cb, &checkpoint);
}
}
@ -678,10 +689,10 @@ static void
free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
{
vdev_t *vd = arg;
vdev_indirect_mark_obsolete(vd, offset, size,
vd->vdev_spa->spa_syncing_txg);
vdev_indirect_mark_obsolete(vd, offset, size);
boolean_t checkpoint = B_FALSE;
vdev_indirect_ops.vdev_op_remap(vd, offset, size,
metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
metaslab_free_impl_cb, &checkpoint);
}
/*
@ -1198,7 +1209,7 @@ spa_vdev_remove_thread(void *arg)
* Assert nothing in flight -- ms_*tree is empty.
*/
for (int i = 0; i < TXG_SIZE; i++) {
ASSERT0(range_tree_space(msp->ms_alloctree[i]));
ASSERT0(range_tree_space(msp->ms_allocating[i]));
}
/*
@ -1228,7 +1239,7 @@ spa_vdev_remove_thread(void *arg)
SM_ALLOC));
space_map_close(sm);
range_tree_walk(msp->ms_freeingtree,
range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
/*
@ -1247,10 +1258,23 @@ spa_vdev_remove_thread(void *arg)
msp->ms_id);
while (!svr->svr_thread_exit &&
range_tree_space(svr->svr_allocd_segs) != 0) {
!range_tree_is_empty(svr->svr_allocd_segs)) {
mutex_exit(&svr->svr_lock);
/*
* This delay will pause the removal around the point
* specified by zfs_remove_max_bytes_pause. We use this
* solely in the test suite or during debugging.
*/
uint64_t bytes_copied =
spa->spa_removing_phys.sr_copied;
for (int i = 0; i < TXG_SIZE; i++)
bytes_copied += svr->svr_bytes_done[i];
while (zfs_remove_max_bytes_pause <= bytes_copied &&
!svr->svr_thread_exit)
delay(hz);
mutex_enter(&vca.vca_lock);
while (vca.vca_outstanding_bytes >
zfs_remove_max_copy_bytes) {
@ -1380,10 +1404,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
* Assert nothing in flight -- ms_*tree is empty.
*/
for (int i = 0; i < TXG_SIZE; i++)
ASSERT0(range_tree_space(msp->ms_alloctree[i]));
ASSERT0(range_tree_space(msp->ms_allocating[i]));
for (int i = 0; i < TXG_DEFER_SIZE; i++)
ASSERT0(range_tree_space(msp->ms_defertree[i]));
ASSERT0(range_tree_space(msp->ms_freedtree));
ASSERT0(range_tree_space(msp->ms_defer[i]));
ASSERT0(range_tree_space(msp->ms_freed));
if (msp->ms_sm != NULL) {
/*
@ -1399,7 +1423,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
range_tree_walk(msp->ms_freeingtree,
range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
/*
@ -1472,7 +1496,8 @@ spa_vdev_remove_cancel(spa_t *spa)
uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
spa_vdev_remove_cancel_sync, NULL, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED);
if (error == 0) {
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
@ -1810,6 +1835,17 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
if (!locked)
txg = spa_vdev_enter(spa);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
error = (spa_has_checkpoint(spa)) ?
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
if (!locked)
return (spa_vdev_exit(spa, NULL, txg, error));
return (error);
}
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (spa->spa_spares.sav_vdevs != NULL &&

View File

@ -1133,7 +1133,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync,
if (sync) {
err = dsl_sync_task(poolname, NULL,
zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_NONE);
zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
if (err != 0)
zcp_pool_error(&evalargs, poolname);
} else {

View File

@ -110,7 +110,7 @@ static zcp_synctask_info_t zcp_synctask_destroy_info = {
{.za_name = "defer", .za_lua_type = LUA_TBOOLEAN},
{NULL, NULL}
},
.space_check = ZFS_SPACE_CHECK_NONE,
.space_check = ZFS_SPACE_CHECK_DESTROY,
.blocks_modified = 0
};
@ -302,10 +302,9 @@ zcp_synctask_wrapper(lua_State *state)
zcp_parse_args(state, info->name, info->pargs, info->kwargs);
err = 0;
if (info->space_check != ZFS_SPACE_CHECK_NONE && funcspace > 0) {
uint64_t quota = dsl_pool_adjustedsize(dp,
info->space_check == ZFS_SPACE_CHECK_RESERVED) -
metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
if (info->space_check != ZFS_SPACE_CHECK_NONE) {
uint64_t quota = dsl_pool_unreserved_space(dp,
info->space_check);
uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes +
ri->zri_space_used;

View File

@ -3658,6 +3658,29 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
nvarg, outnvl));
}
/*
* innvl: unused
* outnvl: empty
*/
/* ARGSUSED */
static int
zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
return (spa_checkpoint(poolname));
}
/*
* innvl: unused
* outnvl: empty
*/
/* ARGSUSED */
static int
zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
nvlist_t *outnvl)
{
return (spa_checkpoint_discard(poolname));
}
/*
* inputs:
* zc_name name of dataset to destroy
@ -5837,6 +5860,15 @@ zfs_ioctl_init(void)
POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
B_TRUE);
zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("zpool_discard_checkpoint",
ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
zfs_secpolicy_config, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
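For context, a hedged sketch of the userland path to these two ioctls. The lzc_pool_checkpoint()/lzc_pool_checkpoint_discard() wrappers and the "zpool checkpoint [-d]" front end belong to the userland half of this changeset and are assumed here rather than shown in this hunk:
/* Illustration only; assumes lzc_init() has already been called. */
#include <libzfs_core.h>

int
wrap_risky_procedure(const char *pool)
{
	int err = lzc_pool_checkpoint(pool);	/* ZFS_IOC_POOL_CHECKPOINT */
	if (err != 0)
		return (err);

	/* ... perform the destructive administrative steps ... */

	/*
	 * On success, drop the checkpoint; after a failure the administrator
	 * would instead export the pool and re-import it with the
	 * checkpoint-rewind option.
	 */
	return (lzc_pool_checkpoint_discard(pool));	/* ZFS_IOC_POOL_DISCARD_CHECKPOINT */
}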
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,

View File

@ -28,6 +28,7 @@
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
@ -393,6 +394,35 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
return (error);
}
/* ARGSUSED */
static int
zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
ASSERT(!BP_IS_HOLE(bp));
/*
* As we call this function from the context of a rewind to a
* checkpoint, each ZIL block whose txg is later than the txg
* that we rewind to is invalid. Thus, we return -1 so
* zil_parse() doesn't attempt to read it.
*/
if (bp->blk_birth >= first_txg)
return (-1);
if (zil_bp_tree_add(zilog, bp) != 0)
return (0);
zio_free(zilog->zl_spa, first_txg, bp);
return (0);
}
/* ARGSUSED */
static int
zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
return (0);
}
static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
@ -436,7 +466,7 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
return (0);
}
@ -622,7 +652,7 @@ zil_create(zilog_t *zilog)
txg = dmu_tx_get_txg(tx);
if (!BP_IS_HOLE(&blk)) {
zio_free_zil(zilog->zl_spa, txg, &blk);
zio_free(zilog->zl_spa, txg, &blk);
BP_ZERO(&blk);
}
@ -722,8 +752,8 @@ int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
dmu_tx_t *tx = txarg;
uint64_t first_txg = dmu_tx_get_txg(tx);
zilog_t *zilog;
uint64_t first_txg;
zil_header_t *zh;
objset_t *os;
int error;
@ -744,16 +774,55 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
first_txg = spa_min_claim_txg(zilog->zl_spa);
if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
if (!BP_IS_HOLE(&zh->zh_log))
zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
/*
* If the spa_log_state is not set to be cleared, check whether
* the current uberblock is a checkpointed one and whether the current
* header has been claimed before moving on.
*
* If the current uberblock is a checkpointed uberblock then
* one of the following scenarios took place:
*
* 1] We are currently rewinding to the checkpoint of the pool.
* 2] We crashed in the middle of a checkpoint rewind but we
* did manage to write the checkpointed uberblock to the
* vdev labels, so when we tried to import the pool again
* the checkpointed uberblock was selected from the import
* procedure.
*
* In both cases we want to zero out all the ZIL blocks, except
* the ones that have been claimed at the time of the checkpoint
* (their zh_claim_txg != 0). The reason is that these blocks
* may be corrupted since we may have reused their locations on
* disk after we took the checkpoint.
*
* We could try to set spa_log_state to SPA_LOG_CLEAR earlier
* when we first figure out whether the current uberblock is
* checkpointed or not. Unfortunately, that would discard all
* the logs, including the ones that are claimed, and we would
* leak space.
*/
if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
(zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
zh->zh_claim_txg == 0)) {
if (!BP_IS_HOLE(&zh->zh_log)) {
(void) zil_parse(zilog, zil_clear_log_block,
zil_noop_log_record, tx, first_txg);
}
BP_ZERO(&zh->zh_log);
dsl_dataset_dirty(dmu_objset_ds(os), tx);
dmu_objset_disown(os, FTAG);
return (0);
}
/*
* If we are not rewinding and opening the pool normally, then
* the min_claim_txg should be equal to the first txg of the pool.
*/
ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
/*
* Claim all log blocks if we haven't already done so, and remember
* the highest claimed sequence number. This ensures that if we can
@ -805,16 +874,17 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
zilog = dmu_objset_zil(os);
bp = (blkptr_t *)&zilog->zl_header->zh_log;
/*
* Check the first block and determine if it's on a log device
* which may have been removed or faulted prior to loading this
* pool. If so, there's no point in checking the rest of the log
* as its content should have already been synced to the pool.
*/
if (!BP_IS_HOLE(bp)) {
vdev_t *vd;
boolean_t valid = B_TRUE;
/*
* Check the first block and determine if it's on a log device
* which may have been removed or faulted prior to loading this
* pool. If so, there's no point in checking the rest of the
* log as its content should have already been synced to the
* pool.
*/
spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
if (vd->vdev_islog && vdev_is_dead(vd))
@ -823,6 +893,18 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
if (!valid)
return (0);
/*
* Check whether the current uberblock is checkpointed (e.g.
* we are rewinding) and whether the current header has been
* claimed or not. If it hasn't then skip verifying it. We
* do this because its ZIL blocks may be part of the pool's
* state before the rewind, which is no longer valid.
*/
zil_header_t *zh = zil_header_in_syncing_context(zilog);
if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
zh->zh_claim_txg == 0)
return (0);
}
/*
@ -833,7 +915,8 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
* which will update spa_max_claim_txg. See spa_load() for details.
*/
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
zilog->zl_header->zh_claim_txg ? -1ULL :
spa_min_claim_txg(os->os_spa));
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
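Both zil_claim() and zil_check_log_chain() now derive their claim txg from spa_min_claim_txg() instead of spa_first_txg(). The helper itself is not part of this excerpt; the sketch below shows the behavior its callers rely on, reconstructed from the uses above and not the verbatim implementation.

/*
 * Sketch only.  The callers above use this value both as the txg in
 * which log blocks are claimed or freed and as the birth-txg cutoff
 * beyond which ZIL blocks are treated as stale during a checkpoint
 * rewind.  With a checkpointed uberblock the cutoff sits one txg past
 * the checkpoint; otherwise it is simply the pool's first txg.
 */
uint64_t
spa_min_claim_txg(spa_t *spa)
{
        uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;

        if (checkpoint_txg != 0)
                return (checkpoint_txg + 1);

        return (spa_first_txg(spa));
}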


@ -956,8 +956,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
* starts allocating blocks -- so that nothing is allocated twice.
* If txg == 0 we just verify that the block is claimable.
*/
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
ASSERT(txg == spa_first_txg(spa) || txg == 0);
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
spa_min_claim_txg(spa));
ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
@ -3027,18 +3028,6 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
return (error);
}
/*
* Free an intent log block.
*/
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
ASSERT(!BP_IS_GANG(bp));
zio_free(spa, txg, bp);
}
/*
* ==========================================================================
* Read and write to physical devices


@ -235,8 +235,6 @@ zthr_destroy(zthr_t *t)
void
zthr_wakeup(zthr_t *t)
{
ASSERT3P(t->zthr_thread, !=, NULL);
mutex_enter(&t->zthr_lock);
cv_broadcast(&t->zthr_cv);
mutex_exit(&t->zthr_lock);


@ -209,6 +209,7 @@ typedef enum {
ZPOOL_PROP_LEAKED,
ZPOOL_PROP_MAXBLOCKSIZE,
ZPOOL_PROP_BOOTSIZE,
ZPOOL_PROP_CHECKPOINT,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@ -536,6 +537,7 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_DTL "DTL"
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
@ -621,6 +623,8 @@ typedef struct zpool_rewind_policy {
"com.delphix:indirect_obsolete_sm"
#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
"com.delphix:obsolete_counts_are_precise"
#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
"com.delphix:pool_checkpoint_sm"
/*
* This is needed in userland to report the minimum necessary device size.
@ -779,6 +783,18 @@ typedef enum dsl_scan_state {
DSS_NUM_STATES
} dsl_scan_state_t;
typedef enum {
CS_NONE,
CS_CHECKPOINT_EXISTS,
CS_CHECKPOINT_DISCARDING,
CS_NUM_STATES
} checkpoint_state_t;
typedef struct pool_checkpoint_stat {
uint64_t pcs_state; /* checkpoint_state_t */
uint64_t pcs_start_time; /* time checkpoint/discard started */
uint64_t pcs_space; /* checkpointed space */
} pool_checkpoint_stat_t;
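The checkpoint_state_t / pool_checkpoint_stat_t pair is how the kernel reports checkpoint progress to userland; the struct travels as a uint64_t array under ZPOOL_CONFIG_CHECKPOINT_STATS (defined above). The sketch below shows how a status-style consumer might read it back; the function name and the assumption that the key hangs off the root-vdev nvlist are illustrative, not taken from this diff.

/*
 * Illustration only.  'nvroot' is assumed to be the root-vdev nvlist of
 * an imported pool's configuration; the stats are absent when the pool
 * has no checkpoint.
 */
#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

static void
print_checkpoint_info(nvlist_t *nvroot)
{
        pool_checkpoint_stat_t *pcs = NULL;
        uint_t c;

        (void) nvlist_lookup_uint64_array(nvroot,
            ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);

        if (pcs == NULL || pcs->pcs_state == CS_NONE)
                return; /* no checkpoint to report */

        (void) printf("checkpoint %s, %llu bytes consumed\n",
            pcs->pcs_state == CS_CHECKPOINT_DISCARDING ?
            "being discarded" : "exists",
            (u_longlong_t)pcs->pcs_space);
}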
/*
* Vdev statistics. Note: all fields should be 64-bit because this
@ -802,6 +818,7 @@ typedef struct vdev_stat {
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
} vdev_stat_t;
/*
@ -926,9 +943,27 @@ typedef enum zfs_ioc {
ZFS_IOC_DESTROY_BOOKMARKS,
ZFS_IOC_CHANNEL_PROGRAM,
ZFS_IOC_REMAP,
ZFS_IOC_POOL_CHECKPOINT,
ZFS_IOC_POOL_DISCARD_CHECKPOINT,
ZFS_IOC_LAST
} zfs_ioc_t;
/*
* ZFS-specific error codes used for returning descriptive errors
* to the userland through zfs ioctls.
*
* The enum implicitly includes all the error codes from errno.h.
* New code should use and extend this enum for errors that are
* not described precisely by generic errno codes.
*/
typedef enum {
ZFS_ERR_CHECKPOINT_EXISTS = 1024,
ZFS_ERR_DISCARDING_CHECKPOINT,
ZFS_ERR_NO_CHECKPOINT,
ZFS_ERR_DEVRM_IN_PROGRESS,
ZFS_ERR_VDEV_TOO_BIG
} zfs_errno_t;
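Since these values sit above the errno range, userland that surfaces them needs its own translation. The switch below is a hypothetical sketch; the message strings and the helper name are illustrative and the real libzfs error paths are not shown in this excerpt.

/*
 * Hypothetical mapping from the new zfs_errno_t values to messages;
 * illustration only, not the libzfs implementation.
 */
#include <string.h>
#include <sys/fs/zfs.h>

static const char *
zfs_checkpoint_strerror(int err)
{
        switch (err) {
        case ZFS_ERR_CHECKPOINT_EXISTS:
                return ("checkpoint already exists");
        case ZFS_ERR_DISCARDING_CHECKPOINT:
                return ("checkpoint is currently being discarded");
        case ZFS_ERR_NO_CHECKPOINT:
                return ("the pool has no checkpoint");
        case ZFS_ERR_DEVRM_IN_PROGRESS:
                return ("a device removal is in progress");
        case ZFS_ERR_VDEV_TOO_BIG:
                return ("a top-level vdev is too large");
        default:
                return (strerror(err));
        }
}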
/*
* Internal SPA load state. Used by FMA diagnosis engine.
*/
@ -989,6 +1024,7 @@ typedef enum {
#define ZFS_IMPORT_ANY_HOST 0x2
#define ZFS_IMPORT_MISSING_LOG 0x4
#define ZFS_IMPORT_ONLY 0x8
#define ZFS_IMPORT_CHECKPOINT 0x10
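ZFS_IMPORT_CHECKPOINT is the import-time request to rewind: when set, the import path selects the checkpointed uberblock instead of the current one. A hypothetical caller is sketched below, assuming the existing zpool_import_props() libzfs entry point and an illustrative do_rewind flag; the actual --rewind-to-checkpoint plumbing in zpool(1M) is not shown in this excerpt.

/*
 * Illustration only: request a rewind-to-checkpoint import through the
 * existing libzfs import entry point.  'config' is a candidate pool
 * configuration found during import discovery.
 */
#include <libzfs.h>

static int
import_with_optional_rewind(libzfs_handle_t *hdl, nvlist_t *config,
    nvlist_t *props, boolean_t do_rewind)
{
        int flags = ZFS_IMPORT_NORMAL;

        if (do_rewind)
                flags |= ZFS_IMPORT_CHECKPOINT;

        return (zpool_import_props(hdl, config, NULL, props, flags));
}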
/*
* Channel program argument/return nvlist keys and defaults.