
MFV r354383: 10592 misc. metaslab and vdev related ZoL bug fixes

illumos/illumos-gate@555d674d5d
555d674d5d

https://www.illumos.org/issues/10592
  This is a collection of recent fixes from ZoL:
  8eef997679 Error path in metaslab_load_impl() forgets to drop ms_sync_lock
  928e8ad47d Introduce auxiliary metaslab histograms
  425d3237ee Get rid of space_map_update() for ms_synced_length
  6c926f426a Simplify log vdev removal code
  21e7cf5da8 zdb -L should skip leak detection altogether
  df72b8bebe Rename range_tree_verify to range_tree_verify_not_present
  75058f3303 Remove unused vdev_t fields

Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Author: Serapheim Dimitropoulos <serapheim@delphix.com>
MFC after:	4 weeks
Committed by Andriy Gapon on 2019-11-21 13:35:43 +00:00
commit 8491540808
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=354948
16 changed files with 772 additions and 447 deletions
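The common thread in the hunks below is the removal of the space map's in-core shadow state: space_map_update(), sm_alloc, and sm_length are gone, smp_objsize becomes smp_length, and readers either consult the phys fields through space_map_allocated()/space_map_length() or pass an explicit byte offset to space_map_iterate()/space_map_load_length(). A minimal sketch of the resulting reader-side pattern; count_segments_cb() and walk_synced_space_map() are hypothetical names, not part of this commit:

/* Sketch only: walk the synced prefix of a space map with the new API. */
static int
count_segments_cb(space_map_entry_t *sme, void *arg)
{
	uint64_t *nsegs = arg;

	/* sme->sme_offset and sme->sme_run describe one segment */
	(*nsegs)++;
	return (0);
}

static void
walk_synced_space_map(space_map_t *sm)
{
	uint64_t nsegs = 0;

	/* smp_length is read directly; no space_map_update() step anymore */
	VERIFY0(space_map_iterate(sm, space_map_length(sm),
	    count_segments_cb, &nsegs));
	(void) printf("%llu segments, alloc %lld\n",
	    (u_longlong_t)nsegs, (longlong_t)space_map_allocated(sm));
}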

View File

@ -10,7 +10,7 @@
.\"
.\"
.\" Copyright 2012, Richard Lowe.
.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\"
.Dd October 06, 2017
@ -187,7 +187,7 @@ If the
.Fl u
option is also specified, also display the uberblocks on this device.
.It Fl L
Disable leak tracing and the loading of space maps.
Disable leak detection and the loading of space maps.
By default,
.Nm
verifies that all non-free blocks are referenced, which can be very expensive.

View File

@ -785,18 +785,21 @@ dump_spacemap(objset_t *os, space_map_t *sm)
return;
(void) printf("space map object %llu:\n",
(longlong_t)sm->sm_phys->smp_object);
(void) printf(" smp_objsize = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_objsize);
(longlong_t)sm->sm_object);
(void) printf(" smp_length = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_length);
(void) printf(" smp_alloc = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_alloc);
if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
return;
/*
* Print out the freelist entries in both encoded and decoded form.
*/
uint8_t mapshift = sm->sm_shift;
int64_t alloc = 0;
uint64_t word;
uint64_t word, entry_id = 0;
for (uint64_t offset = 0; offset < space_map_length(sm);
offset += sizeof (word)) {
@ -804,11 +807,12 @@ dump_spacemap(objset_t *os, space_map_t *sm)
sizeof (word), &word, DMU_READ_PREFETCH));
if (sm_entry_is_debug(word)) {
(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
(u_longlong_t)(offset / sizeof (word)),
(void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
(u_longlong_t)entry_id,
ddata[SM_DEBUG_ACTION_DECODE(word)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(word),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
entry_id++;
continue;
}
@ -846,7 +850,7 @@ dump_spacemap(objset_t *os, space_map_t *sm)
(void) printf("\t [%6llu] %c range:"
" %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
(u_longlong_t)(offset / sizeof (word)),
(u_longlong_t)entry_id,
entry_type, (u_longlong_t)entry_off,
(u_longlong_t)(entry_off + entry_run),
(u_longlong_t)entry_run,
@ -856,8 +860,9 @@ dump_spacemap(objset_t *os, space_map_t *sm)
alloc += entry_run;
else
alloc -= entry_run;
entry_id++;
}
if ((uint64_t)alloc != space_map_allocated(sm)) {
if (alloc != space_map_allocated(sm)) {
(void) printf("space_map_object alloc (%lld) INCONSISTENT "
"with space map summary (%lld)\n",
(longlong_t)space_map_allocated(sm), (longlong_t)alloc);
@ -921,11 +926,8 @@ dump_metaslab(metaslab_t *msp)
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
}
ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
}
static void
@ -3140,6 +3142,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
ddt_entry_t dde;
int error;
ASSERT(!dump_opt['L']);
bzero(&ddb, sizeof (ddb));
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
@ -3163,12 +3167,10 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
zcb->zcb_dedup_blocks++;
}
}
if (!dump_opt['L']) {
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
}
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
}
ASSERT(error == ENOENT);
@ -3210,6 +3212,9 @@ claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
if (dump_opt['L'])
return;
if (spa->spa_vdev_removal == NULL)
return;
@ -3301,7 +3306,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
space_map_t *prev_obsolete_sm = NULL;
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
space_map_update(prev_obsolete_sm);
vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
prev_obsolete_sm);
space_map_close(prev_obsolete_sm);
@ -3395,9 +3399,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
space_map_update(checkpoint_sm);
VERIFY0(space_map_iterate(checkpoint_sm,
space_map_length(checkpoint_sm),
checkpoint_sm_exclude_entry_cb, &cseea));
space_map_close(checkpoint_sm);
@ -3407,6 +3411,8 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
ASSERT(!dump_opt['L']);
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
@ -3503,6 +3509,8 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
static void
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
{
ASSERT(!dump_opt['L']);
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
@ -3549,67 +3557,63 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
zcb->zcb_spa = spa;
if (!dump_opt['L']) {
dsl_pool_t *dp = spa->spa_dsl_pool;
vdev_t *rvd = spa->spa_root_vdev;
if (dump_opt['L'])
return;
/*
* We are going to be changing the meaning of the metaslab's
* ms_allocatable. Ensure that the allocator doesn't try to
* use the tree.
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
dsl_pool_t *dp = spa->spa_dsl_pool;
vdev_t *rvd = spa->spa_root_vdev;
zcb->zcb_vd_obsolete_counts =
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
UMEM_NOFAIL);
/*
* We are going to be changing the meaning of the metaslab's
* ms_allocatable. Ensure that the allocator doesn't try to
* use the tree.
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
/*
* For leak detection, we overload the ms_allocatable trees
* to contain allocated segments instead of free segments.
* As a result, we can't use the normal metaslab_load/unload
* interfaces.
*/
zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
zcb->zcb_vd_obsolete_counts =
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
UMEM_NOFAIL);
/*
* On load_concrete_ms_allocatable_trees() we loaded all the
* allocated entries from the ms_sm to the ms_allocatable for
* each metaslab. If the pool has a checkpoint or is in the
* middle of discarding a checkpoint, some of these blocks
* may have been freed but their ms_sm may not have been
* updated because they are referenced by the checkpoint. In
* order to avoid false-positives during leak-detection, we
* go through the vdev's checkpoint space map and exclude all
* its entries from their relevant ms_allocatable.
*
* We also aggregate the space held by the checkpoint and add
* it to zcb_checkpoint_size.
*
* Note that at this point we are also verifying that all the
* entries on the checkpoint_sm are marked as allocated in
* the ms_sm of their relevant metaslab.
* [see comment in checkpoint_sm_exclude_entry_cb()]
*/
zdb_leak_init_exclude_checkpoint(spa, zcb);
/*
* For leak detection, we overload the ms_allocatable trees
* to contain allocated segments instead of free segments.
* As a result, we can't use the normal metaslab_load/unload
* interfaces.
*/
zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
/* for cleaner progress output */
(void) fprintf(stderr, "\n");
/*
* On load_concrete_ms_allocatable_trees() we loaded all the
* allocated entries from the ms_sm to the ms_allocatable for
* each metaslab. If the pool has a checkpoint or is in the
* middle of discarding a checkpoint, some of these blocks
* may have been freed but their ms_sm may not have been
* updated because they are referenced by the checkpoint. In
* order to avoid false-positives during leak-detection, we
* go through the vdev's checkpoint space map and exclude all
* its entries from their relevant ms_allocatable.
*
* We also aggregate the space held by the checkpoint and add
* it to zcb_checkpoint_size.
*
* Note that at this point we are also verifying that all the
* entries on the checkpoint_sm are marked as allocated in
* the ms_sm of their relevant metaslab.
* [see comment in checkpoint_sm_exclude_entry_cb()]
*/
zdb_leak_init_exclude_checkpoint(spa, zcb);
ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
ASSERT(spa_feature_is_enabled(spa,
SPA_FEATURE_DEVICE_REMOVAL));
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
increment_indirect_mapping_cb, zcb, NULL);
}
} else {
/*
* If leak tracing is disabled, we still need to consider
* any checkpointed space in our space verification.
*/
zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
/* for cleaner progress output */
(void) fprintf(stderr, "\n");
if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
ASSERT(spa_feature_is_enabled(spa,
SPA_FEATURE_DEVICE_REMOVAL));
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
increment_indirect_mapping_cb, zcb, NULL);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@ -3690,52 +3694,58 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
boolean_t leaks = B_FALSE;
if (!dump_opt['L']) {
vdev_t *rvd = spa->spa_root_vdev;
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
metaslab_group_t *mg = vd->vdev_mg;
if (dump_opt['L'])
return (B_FALSE);
if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
boolean_t leaks = B_FALSE;
vdev_t *rvd = spa->spa_root_vdev;
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
#if DEBUG
metaslab_group_t *mg = vd->vdev_mg;
#endif
if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
}
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT3P(mg, ==, msp->ms_group);
/*
* ms_allocatable has been overloaded
* to contain allocated segments. Now that
* we finished traversing all blocks, any
* block that remains in the ms_allocatable
* represents an allocated block that we
* did not claim during the traversal.
* Claimed blocks would have been removed
* from the ms_allocatable. For indirect
* vdevs, space remaining in the tree
* represents parts of the mapping that are
* not referenced, which is not a bug.
*/
if (vd->vdev_ops == &vdev_indirect_ops) {
range_tree_vacate(msp->ms_allocatable,
NULL, NULL);
} else {
range_tree_vacate(msp->ms_allocatable,
zdb_leak, vd);
}
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT3P(mg, ==, msp->ms_group);
/*
* ms_allocatable has been overloaded
* to contain allocated segments. Now that
* we finished traversing all blocks, any
* block that remains in the ms_allocatable
* represents an allocated block that we
* did not claim during the traversal.
* Claimed blocks would have been removed
* from the ms_allocatable. For indirect
* vdevs, space remaining in the tree
* represents parts of the mapping that are
* not referenced, which is not a bug.
*/
if (vd->vdev_ops == &vdev_indirect_ops) {
range_tree_vacate(msp->ms_allocatable,
NULL, NULL);
} else {
range_tree_vacate(msp->ms_allocatable,
zdb_leak, vd);
}
if (msp->ms_loaded) {
msp->ms_loaded = B_FALSE;
}
if (msp->ms_loaded) {
msp->ms_loaded = B_FALSE;
}
}
umem_free(zcb->zcb_vd_obsolete_counts,
rvd->vdev_children * sizeof (uint32_t *));
zcb->zcb_vd_obsolete_counts = NULL;
}
umem_free(zcb->zcb_vd_obsolete_counts,
rvd->vdev_children * sizeof (uint32_t *));
zcb->zcb_vd_obsolete_counts = NULL;
return (leaks);
}
@ -3774,13 +3784,18 @@ dump_block_stats(spa_t *spa)
!dump_opt['L'] ? "nothing leaked " : "");
/*
* Load all space maps as SM_ALLOC maps, then traverse the pool
* claiming each block we discover. If the pool is perfectly
* consistent, the space maps will be empty when we're done.
* Anything left over is a leak; any block we can't claim (because
* it's not part of any space map) is a double allocation,
* reference to a freed block, or an unclaimed log block.
* When leak detection is enabled we load all space maps as SM_ALLOC
* maps, then traverse the pool claiming each block we discover. If
* the pool is perfectly consistent, the segment trees will be empty
* when we're done. Anything left over is a leak; any block we can't
* claim (because it's not part of any space map) is a double
* allocation, reference to a freed block, or an unclaimed log block.
*
* When leak detection is disabled (-L option) we still traverse the
* pool claiming each block we discover, but we skip opening any space
* maps.
*/
bzero(&zcb, sizeof (zdb_cb_t));
zdb_leak_init(spa, &zcb);
/*
@ -3859,11 +3874,10 @@ dump_block_stats(spa_t *spa)
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
if (total_found == total_alloc) {
if (!dump_opt['L'])
(void) printf("\n\tNo leaks (block sum matches space"
" maps exactly)\n");
} else {
if (total_found == total_alloc && !dump_opt['L']) {
(void) printf("\n\tNo leaks (block sum matches space"
" maps exactly)\n");
} else if (!dump_opt['L']) {
(void) printf("block traversal size %llu != alloc %llu "
"(%s %lld)\n",
(u_longlong_t)total_found,
@ -4203,7 +4217,6 @@ verify_device_removal_feature_counts(spa_t *spa)
spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object,
0, vd->vdev_asize, 0));
space_map_update(prev_obsolete_sm);
dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
(void) printf("\n");
space_map_close(prev_obsolete_sm);
@ -4409,7 +4422,8 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
* their respective ms_allocatable trees should not contain them.
*/
mutex_enter(&ms->ms_lock);
range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
range_tree_verify_not_present(ms->ms_allocatable,
sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);
return (0);
@ -4472,7 +4486,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
checkpoint_sm_obj, 0, current_vd->vdev_asize,
current_vd->vdev_ashift));
space_map_update(checkpoint_sm);
verify_checkpoint_sm_entry_cb_arg_t vcsec;
vcsec.vcsec_vd = ckpoint_vd;
@ -4480,6 +4493,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
vcsec.vcsec_num_entries =
space_map_length(checkpoint_sm) / sizeof (uint64_t);
VERIFY0(space_map_iterate(checkpoint_sm,
space_map_length(checkpoint_sm),
verify_checkpoint_sm_entry_cb, &vcsec));
dump_spacemap(current->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
@ -4559,7 +4573,7 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
* are part of the checkpoint were freed by mistake.
*/
range_tree_walk(ckpoint_msp->ms_allocatable,
(range_tree_func_t *)range_tree_verify,
(range_tree_func_t *)range_tree_verify_not_present,
current_msp->ms_allocatable);
}
}
@ -4571,6 +4585,8 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
static void
verify_checkpoint_blocks(spa_t *spa)
{
ASSERT(!dump_opt['L']);
spa_t *checkpoint_spa;
char *checkpoint_pool;
nvlist_t *config = NULL;
@ -4636,7 +4652,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
space_map_update(checkpoint_sm);
dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
}
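Taken together, the zdb.c hunks above converge on one control-flow convention: routines that only matter for leak detection now return immediately when -L was given, or assert that they can no longer be reached with -L set, instead of nesting their whole body under if (!dump_opt['L']). A condensed sketch of that convention (the function names are the real ones from the diff, bodies elided):

static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
	if (dump_opt['L'])	/* -L: skip leak detection altogether */
		return;
	/* ... claim blocks referenced by an in-progress device removal ... */
}

static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
	/* only reachable when leak detection is actually running */
	ASSERT(!dump_opt['L']);
	/* ... exclude checkpointed blocks from the leak-tracking trees ... */
}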

View File

@ -584,45 +584,62 @@ metaslab_compare(const void *x1, const void *x2)
return (AVL_CMP(m1->ms_start, m2->ms_start));
}
uint64_t
metaslab_allocated_space(metaslab_t *msp)
{
return (msp->ms_allocated_space);
}
/*
* Verify that the space accounting on disk matches the in-core range_trees.
*/
void
static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
uint64_t allocated = 0;
uint64_t allocating = 0;
uint64_t sm_free_space, msp_free_space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(!msp->ms_condensing);
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/*
* We can only verify the metaslab space when we're called
* from syncing context with a loaded metaslab that has an allocated
* space map. Calling this in non-syncing context does not
* provide a consistent view of the metaslab since we're performing
* allocations in the future.
* from syncing context with a loaded metaslab that has an
* allocated space map. Calling this in non-syncing context
* does not provide a consistent view of the metaslab since
* we're performing allocations in the future.
*/
if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
!msp->ms_loaded)
return;
sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
space_map_alloc_delta(msp->ms_sm);
/*
* Even though the smp_alloc field can get negative (e.g.
* see vdev_checkpoint_sm), that should never be the case
* when it comes to a metaslab's space map.
*/
ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
/*
* Account for future allocations since we would have already
* deducted that space from the ms_freetree.
* Account for future allocations since we would have
* already deducted that space from the ms_allocatable.
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
allocated +=
allocating +=
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
ASSERT3U(msp->ms_deferspace, ==,
range_tree_space(msp->ms_defer[0]) +
range_tree_space(msp->ms_defer[1]));
msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
@ -929,6 +946,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT(msp != NULL);
/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
@ -1470,6 +1488,203 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
* ==========================================================================
*/
static void
metaslab_aux_histograms_clear(metaslab_t *msp)
{
/*
* Auxiliary histograms are only cleared when resetting them,
* which can only happen while the metaslab is loaded.
*/
ASSERT(msp->ms_loaded);
bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
for (int t = 0; t < TXG_DEFER_SIZE; t++)
bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
}
static void
metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
range_tree_t *rt)
{
/*
* This is modeled after space_map_histogram_add(), so refer to that
* function for implementation details. We want this to work like
* the space map histogram, and not the range tree histogram, as we
* are essentially constructing a delta that will be later subtracted
* from the space map histogram.
*/
int idx = 0;
for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
ASSERT3U(i, >=, idx + shift);
histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
ASSERT3U(idx + shift, ==, i);
idx++;
ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
}
}
}
/*
* Called at every sync pass that the metaslab gets synced.
*
* The reason is that we want our auxiliary histograms to be updated
* wherever the metaslab's space map histogram is updated. This way
* we stay consistent on which parts of the metaslab space map's
* histogram are currently not available for allocations (e.g. because
* they are in the defer, freed, and freeing trees).
*/
static void
metaslab_aux_histograms_update(metaslab_t *msp)
{
space_map_t *sm = msp->ms_sm;
ASSERT(sm != NULL);
/*
* This is similar to the metaslab's space map histogram updates
* that take place in metaslab_sync(). The only difference is that
* we only care about segments that haven't made it into the
* ms_allocatable tree yet.
*/
if (msp->ms_loaded) {
metaslab_aux_histograms_clear(msp);
metaslab_aux_histogram_add(msp->ms_synchist,
sm->sm_shift, msp->ms_freed);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
metaslab_aux_histogram_add(msp->ms_deferhist[t],
sm->sm_shift, msp->ms_defer[t]);
}
}
metaslab_aux_histogram_add(msp->ms_synchist,
sm->sm_shift, msp->ms_freeing);
}
/*
* Called every time we are done syncing (writing to) the metaslab,
* i.e. at the end of each sync pass.
* [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
*/
static void
metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
space_map_t *sm = msp->ms_sm;
if (sm == NULL) {
/*
* We came here from metaslab_init() when creating/opening a
* pool, looking at a metaslab that hasn't had any allocations
* yet.
*/
return;
}
/*
* This is similar to the actions that we take for the ms_freed
* and ms_defer trees in metaslab_sync_done().
*/
uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
if (defer_allowed) {
bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
sizeof (msp->ms_synchist));
} else {
bzero(msp->ms_deferhist[hist_index],
sizeof (msp->ms_deferhist[hist_index]));
}
bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
}
/*
* Ensure that the metaslab's weight and fragmentation are consistent
* with the contents of the histogram (either the range tree's histogram
* or the space map's depending whether the metaslab is loaded).
*/
static void
metaslab_verify_weight_and_frag(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/* see comment in metaslab_verify_unflushed_changes() */
if (msp->ms_group == NULL)
return;
/*
* Devices being removed always return a weight of 0 and leave
* fragmentation and ms_max_size as is - there is nothing for
* us to verify here.
*/
vdev_t *vd = msp->ms_group->mg_vd;
if (vd->vdev_removing)
return;
/*
* If the metaslab is dirty it probably means that we've done
* some allocations or frees that have changed our histograms
* and thus the weight.
*/
for (int t = 0; t < TXG_SIZE; t++) {
if (txg_list_member(&vd->vdev_ms_list, msp, t))
return;
}
/*
* This verification checks that our in-memory state is consistent
* with what's on disk. If the pool is read-only then there aren't
* any changes and we just have the initially-loaded state.
*/
if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
return;
/* some extra verification for the in-core tree, when possible */
if (msp->ms_loaded) {
range_tree_stat_verify(msp->ms_allocatable);
VERIFY(space_map_histogram_verify(msp->ms_sm,
msp->ms_allocatable));
}
uint64_t weight = msp->ms_weight;
uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
uint64_t frag = msp->ms_fragmentation;
uint64_t max_segsize = msp->ms_max_size;
msp->ms_weight = 0;
msp->ms_fragmentation = 0;
msp->ms_max_size = 0;
/*
* This function is used for verification purposes. Regardless of
* whether metaslab_weight() thinks this metaslab should be active or
* not, we want to ensure that the actual weight (and therefore the
* value of ms_weight) would be the same if it was to be recalculated
* at this point.
*/
msp->ms_weight = metaslab_weight(msp) | was_active;
VERIFY3U(max_segsize, ==, msp->ms_max_size);
/*
* If the weight type changed then there is no point in doing
* verification. Revert fields to their original values.
*/
if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
(!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
msp->ms_fragmentation = frag;
msp->ms_weight = weight;
return;
}
VERIFY3U(msp->ms_fragmentation, ==, frag);
VERIFY3U(msp->ms_weight, ==, weight);
}
/*
* Wait for any in-progress metaslab loads to complete.
*/
@ -1491,47 +1706,94 @@ metaslab_load_impl(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loading);
ASSERT(!msp->ms_condensing);
/*
* Nobody else can manipulate a loading metaslab, so it's now safe
* to drop the lock. This way we don't have to hold the lock while
* reading the spacemap from disk.
* We temporarily drop the lock to unblock other operations while we
* are reading the space map. Therefore, metaslab_sync() and
* metaslab_sync_done() can run at the same time as we do.
*
* metaslab_sync() can append to the space map while we are loading.
* Therefore we load only entries that existed when we started the
* load. Additionally, metaslab_sync_done() has to wait for the load
* to complete because there are potential races like metaslab_load()
* loading parts of the space map that are currently being appended
* by metaslab_sync(). If we didn't, the ms_allocatable would have
* entries that metaslab_sync_done() would try to re-add later.
*
* That's why before dropping the lock we remember the synced length
* of the metaslab and read up to that point of the space map,
* ignoring entries appended by metaslab_sync() that happen after we
* drop the lock.
*/
uint64_t length = msp->ms_synced_length;
mutex_exit(&msp->ms_lock);
/*
* If the space map has not been allocated yet, then treat
* all the space in the metaslab as free and add it to ms_allocatable.
*/
if (msp->ms_sm != NULL) {
error = space_map_load(msp->ms_sm, msp->ms_allocatable,
SM_FREE);
error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
SM_FREE, length);
} else {
/*
* The space map has not been allocated yet, so treat
* all the space in the metaslab as free and add it to the
* ms_allocatable tree.
*/
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
}
/*
* We need to grab the ms_sync_lock to prevent metaslab_sync() from
* changing the ms_sm and the metaslab's range trees while we are
* about to use them and populate the ms_allocatable. The ms_lock
* is insufficient for this because metaslab_sync() doesn't hold
* the ms_lock while writing the ms_checkpointing tree to disk.
*/
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
ASSERT(!msp->ms_condensing);
if (error != 0)
if (error != 0) {
mutex_exit(&msp->ms_sync_lock);
return (error);
}
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
/*
* If the metaslab already has a spacemap, then we need to
* remove all segments from the defer tree; otherwise, the
* metaslab is completely empty and we can skip this.
* The ms_allocatable contains the segments that exist in the
* ms_defer trees [see ms_synced_length]. Thus we need to remove
* them from ms_allocatable as they will be added again in
* metaslab_sync_done().
*/
if (msp->ms_sm != NULL) {
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
range_tree_remove, msp->ms_allocatable);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
range_tree_remove, msp->ms_allocatable);
}
/*
* Call metaslab_recalculate_weight_and_sort() now that the
* metaslab is loaded so we get the metaslab's real weight.
*
* Unless this metaslab was created with older software and
* has not yet been converted to use segment-based weight, we
* expect the new weight to be better or equal to the weight
* that the metaslab had while it was not loaded. This is
* because the old weight does not take into account the
* consolidation of adjacent segments between TXGs. [see
* comment for ms_synchist and ms_deferhist[] for more info]
*/
uint64_t weight = msp->ms_weight;
metaslab_recalculate_weight_and_sort(msp);
if (!WEIGHT_IS_SPACEBASED(weight))
ASSERT3U(weight, <=, msp->ms_weight);
msp->ms_max_size = metaslab_block_maxsize(msp);
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
metaslab_verify_space(msp, spa_syncing_txg(spa));
mutex_exit(&msp->ms_sync_lock);
return (0);
}
@ -1548,6 +1810,7 @@ metaslab_load(metaslab_t *msp)
if (msp->ms_loaded)
return (0);
VERIFY(!msp->ms_loading);
ASSERT(!msp->ms_condensing);
msp->ms_loading = B_TRUE;
int error = metaslab_load_impl(msp);
@ -1561,10 +1824,29 @@ void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
metaslab_verify_weight_and_frag(msp);
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
/*
* We explicitly recalculate the metaslab's weight based on its space
* map (as it is now not loaded). We want unloaded metaslabs to always
* have their weights calculated from the space map histograms, while
* loaded ones have it calculated from their in-core range tree
* [see metaslab_load()]. This way, the weight reflects the information
* available in-core, whether it is loaded or not.
*
* If ms_group == NULL, it means that we came here from metaslab_fini(),
* at which point it doesn't make sense for us to do the recalculation
* and the sorting.
*/
if (msp->ms_group != NULL)
metaslab_recalculate_weight_and_sort(msp);
}
static void
@ -1604,6 +1886,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
*
* Note:
* When called from vdev_expand(), we can't call into the DMU as
* we are holding the spa_config_lock as a writer and we would
* deadlock [see relevant comment in vdev_metaslab_init()]. In
* that case, the object parameter is zero though, so we won't
* call into the DMU.
*/
if (object != 0) {
error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
@ -1615,14 +1904,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
}
ASSERT(ms->ms_sm != NULL);
ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
/*
* We create the main range tree here, but we don't create the
* We create the ms_allocatable here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
* addition of new space; and for debugging, it ensures that
* we'd data fault on any attempt to use this metaslab before
* it's ready.
*/
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
metaslab_rangesize_compare, 0);
@ -1639,8 +1931,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
* out this txg. This ensures that we don't attempt to allocate
* from it before we have initialized it completely.
*/
if (txg <= TXG_INITIAL)
if (txg <= TXG_INITIAL) {
metaslab_sync_done(ms, 0);
metaslab_space_update(vd, mg->mg_class,
metaslab_allocated_space(ms), 0, 0);
}
/*
* If metaslab_debug_load is set and we're initializing a metaslab
@ -1674,7 +1969,7 @@ metaslab_fini(metaslab_t *msp)
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
metaslab_space_update(vd, mg->mg_class,
-space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
-metaslab_allocated_space(msp), 0, -msp->ms_size);
space_map_close(msp->ms_sm);
@ -1695,6 +1990,9 @@ metaslab_fini(metaslab_t *msp)
range_tree_destroy(msp->ms_checkpointing);
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@ -1710,7 +2008,7 @@ metaslab_fini(metaslab_t *msp)
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
* multiplying that by the fragmetation metric in this table. Doing
* multiplying that by the fragmentation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
@ -1745,10 +2043,10 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
};
/*
* Calclate the metaslab's fragmentation metric. A return value
* of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
* not support this metric. Otherwise, the return value should be in the
* range [0, 100].
* Calculate the metaslab's fragmentation metric and set ms_fragmentation.
* Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
* been upgraded and does not support this metric. Otherwise, the return
* value should be in the range [0, 100].
*/
static void
metaslab_set_fragmentation(metaslab_t *msp)
@ -1841,7 +2139,7 @@ metaslab_space_weight(metaslab_t *msp)
/*
* The baseline weight is the metaslab's free space.
*/
space = msp->ms_size - space_map_allocated(msp->ms_sm);
space = msp->ms_size - metaslab_allocated_space(msp);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
@ -1945,14 +2243,38 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
{
uint64_t weight = 0;
space_map_t *sm = msp->ms_sm;
ASSERT(!msp->ms_loaded);
ASSERT(sm != NULL);
ASSERT3U(space_map_object(sm), !=, 0);
ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
/*
* Create a joint histogram from all the segments that have made
* it to the metaslab's space map histogram, that are not yet
* available for allocation because they are still in the freeing
* pipeline (e.g. freeing, freed, and defer trees). Then subtract
* these segments from the space map's histogram to get a more
* accurate weight.
*/
uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
deferspace_histogram[i] += msp->ms_synchist[i];
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
deferspace_histogram[i] += msp->ms_deferhist[t][i];
}
}
uint64_t weight = 0;
for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
WEIGHT_SET_COUNT(weight,
msp->ms_sm->sm_phys->smp_histogram[i]);
WEIGHT_SET_INDEX(weight, i +
msp->ms_sm->sm_shift);
ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
deferspace_histogram[i]);
uint64_t count =
sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
if (count != 0) {
WEIGHT_SET_COUNT(weight, count);
WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
@ -1977,7 +2299,7 @@ metaslab_segment_weight(metaslab_t *msp)
/*
* The metaslab is completely free.
*/
if (space_map_allocated(msp->ms_sm) == 0) {
if (metaslab_allocated_space(msp) == 0) {
int idx = highbit64(msp->ms_size) - 1;
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
@ -1999,7 +2321,7 @@ metaslab_segment_weight(metaslab_t *msp)
/*
* If the metaslab is fully allocated then just make the weight 0.
*/
if (space_map_allocated(msp->ms_sm) == msp->ms_size)
if (metaslab_allocated_space(msp) == msp->ms_size)
return (0);
/*
* If the metaslab is already loaded, then use the range tree to
@ -2080,6 +2402,8 @@ metaslab_weight(metaslab_t *msp)
*/
if (msp->ms_loaded)
msp->ms_max_size = metaslab_block_maxsize(msp);
else
ASSERT0(msp->ms_max_size);
/*
* Segment-based weighting requires space map histogram support.
@ -2095,6 +2419,15 @@ metaslab_weight(metaslab_t *msp)
return (weight);
}
void
metaslab_recalculate_weight_and_sort(metaslab_t *msp)
{
/* note: we preserve the mask (e.g. indication of primary, etc.) */
uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
metaslab_group_sort(msp->ms_group, msp,
metaslab_weight(msp) | was_active);
}
static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
@ -2479,17 +2812,17 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY(txg <= spa_final_dirty_txg(spa));
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_allocatable. No other
* thread can be modifying this txg's alloc, freeing,
* The only state that can actually be changing concurrently
* with metaslab_sync() is the metaslab's ms_allocatable. No
* other thread can be modifying this txg's alloc, freeing,
* freed, or space_map_phys_t. We drop ms_lock whenever we
* could call into the DMU, because the DMU can call down to us
* (e.g. via zio_free()) at any time.
* could call into the DMU, because the DMU can call down to
* us (e.g. via zio_free()) at any time.
*
* The spa_vdev_remove_thread() can be reading metaslab state
* concurrently, and it is locked out by the ms_sync_lock. Note
* that the ms_lock is insufficient for this, because it is dropped
* by space_map_write().
* concurrently, and it is locked out by the ms_sync_lock.
* Note that the ms_lock is insufficient for this, because it
* is dropped by space_map_write().
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
@ -2501,7 +2834,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
msp->ms_start, msp->ms_size, vd->vdev_ashift));
ASSERT(msp->ms_sm != NULL);
ASSERT0(metaslab_allocated_space(msp));
}
if (!range_tree_is_empty(msp->ms_checkpointing) &&
@ -2549,6 +2884,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
mutex_enter(&msp->ms_lock);
}
msp->ms_allocated_space += range_tree_space(alloctree);
ASSERT3U(msp->ms_allocated_space, >=,
range_tree_space(msp->ms_freeing));
msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
if (!range_tree_is_empty(msp->ms_checkpointing)) {
ASSERT(spa_has_checkpoint(spa));
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
@ -2562,14 +2902,13 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_write(vd->vdev_checkpoint_sm,
msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
space_map_update(vd->vdev_checkpoint_sm);
spa->spa_checkpoint_info.sci_dspace +=
range_tree_space(msp->ms_checkpointing);
vd->vdev_stat.vs_checkpoint_space +=
range_tree_space(msp->ms_checkpointing);
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
-vd->vdev_checkpoint_sm->sm_alloc);
-space_map_allocated(vd->vdev_checkpoint_sm));
range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
}
@ -2614,6 +2953,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* time we load the space map.
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
metaslab_aux_histograms_update(msp);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
@ -2621,16 +2961,18 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* For sync pass 1, we avoid traversing this txg's free range tree
* and instead will just swap the pointers for freeing and
* freed. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
* and instead will just swap the pointers for freeing and freed.
* We can safely do this since the freed_tree is guaranteed to be
* empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
ASSERT0(msp->ms_allocated_this_txg);
} else {
range_tree_vacate(msp->ms_freeing,
range_tree_add, msp->ms_freed);
}
msp->ms_allocated_this_txg += range_tree_space(alloctree);
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
@ -2708,7 +3050,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
}
defer_delta = 0;
alloc_delta = space_map_alloc_delta(msp->ms_sm);
alloc_delta = msp->ms_allocated_this_txg -
range_tree_space(msp->ms_freed);
if (defer_allowed) {
defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
@ -2740,7 +3083,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
msp->ms_loaded ? range_tree_add : NULL,
msp->ms_allocatable);
}
space_map_update(msp->ms_sm);
msp->ms_synced_length = space_map_length(msp->ms_sm);
msp->ms_deferspace += defer_delta;
ASSERT3S(msp->ms_deferspace, >=, 0);
@ -2752,6 +3096,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
*/
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
metaslab_aux_histograms_update_done(msp, defer_allowed);
if (msp->ms_new) {
msp->ms_new = B_FALSE;
@ -2759,12 +3104,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
mg->mg_ms_ready++;
mutex_exit(&mg->mg_lock);
}
/*
* Calculate the new weights before unloading any metaslabs.
* This will give us the most accurate weighting.
* Re-sort metaslab within its group now that we've adjusted
* its allocatable space.
*/
metaslab_group_sort(mg, msp, metaslab_weight(msp) |
(msp->ms_weight & METASLAB_ACTIVE_MASK));
metaslab_recalculate_weight_and_sort(msp);
/*
* If the metaslab is loaded and we've not tried to load or allocate
@ -2791,6 +3136,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT0(range_tree_space(msp->ms_freed));
ASSERT0(range_tree_space(msp->ms_checkpointing));
msp->ms_allocated_this_txg = 0;
mutex_exit(&msp->ms_lock);
}
@ -4073,7 +4419,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0;
ASSERT(bp->blk_birth == 0);
@ -4240,14 +4586,16 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
mutex_enter(&msp->ms_lock);
if (msp->ms_loaded)
range_tree_verify(msp->ms_allocatable, offset, size);
if (msp->ms_loaded) {
range_tree_verify_not_present(msp->ms_allocatable,
offset, size);
}
range_tree_verify(msp->ms_freeing, offset, size);
range_tree_verify(msp->ms_checkpointing, offset, size);
range_tree_verify(msp->ms_freed, offset, size);
range_tree_verify_not_present(msp->ms_freeing, offset, size);
range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
range_tree_verify_not_present(msp->ms_freed, offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
range_tree_verify(msp->ms_defer[j], offset, size);
range_tree_verify_not_present(msp->ms_defer[j], offset, size);
mutex_exit(&msp->ms_lock);
}
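The long comment in metaslab_load_impl() above describes the ms_synced_length scheme; here is a condensed sketch of that load path with the verification and the no-space-map branch elided. metaslab_load_sketch() is a hypothetical name; everything else is from the hunk:

static int
metaslab_load_sketch(metaslab_t *msp)
{
	/* remember how much of the space map had been synced ... */
	uint64_t length = msp->ms_synced_length;
	mutex_exit(&msp->ms_lock);

	/* ... and read only that prefix, so entries appended by
	 * metaslab_sync() while the lock is dropped are ignored */
	int error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
	    SM_FREE, length);

	/* ms_sync_lock keeps metaslab_sync() from changing ms_sm and the
	 * range trees while we finish populating ms_allocatable */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);
	if (error != 0) {
		mutex_exit(&msp->ms_sync_lock);
		return (error);
	}
	msp->ms_loaded = B_TRUE;
	mutex_exit(&msp->ms_sync_lock);
	return (0);
}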

View File

@ -511,13 +511,11 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
}
void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
{
range_seg_t *rs;
rs = range_tree_find(rt, off, size);
range_seg_t *rs = range_tree_find(rt, off, size);
if (rs != NULL)
panic("freeing free block; rs=%p", (void *)rs);
panic("segment already in tree; rs=%p", (void *)rs);
}
boolean_t

View File

@ -129,7 +129,7 @@
* uberblock would reference data in the removed device. For this reason
* and others of similar nature, we disallow the following operations that
* can change the config:
* vdev removal and attach/detach, mirror splitting, and pool reguid.
* vdev removal and attach/detach, mirror splitting, and pool reguid.
*
* - As most of the checkpoint logic is implemented in the SPA and doesn't
* distinguish datasets when it comes to space accounting, having a
@ -262,7 +262,7 @@ spa_checkpoint_accounting_verify(spa_t *spa)
if (vd->vdev_checkpoint_sm != NULL) {
ckpoint_sm_space_sum +=
-vd->vdev_checkpoint_sm->sm_alloc;
-space_map_allocated(vd->vdev_checkpoint_sm);
vs_ckpoint_space_sum +=
vd->vdev_stat.vs_checkpoint_space;
ASSERT3U(ckpoint_sm_space_sum, ==,
@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
error, vd->vdev_id);
}
ASSERT0(words_after);
ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
space_map_free(vd->vdev_checkpoint_sm, tx);

View File

@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -86,20 +86,22 @@ sm_entry_is_double_word(uint64_t e)
/*
* Iterate through the space map, invoking the callback on each (non-debug)
* space map entry.
* space map entry. Stop after reading 'end' bytes of the space map.
*/
int
space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
{
uint64_t sm_len = space_map_length(sm);
ASSERT3U(sm->sm_blksz, !=, 0);
uint64_t blksz = sm->sm_blksz;
dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
ASSERT3U(blksz, !=, 0);
ASSERT3U(end, <=, space_map_length(sm));
ASSERT0(P2PHASE(end, sizeof (uint64_t)));
dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
ZIO_PRIORITY_SYNC_READ);
uint64_t blksz = sm->sm_blksz;
int error = 0;
for (uint64_t block_base = 0; block_base < sm_len && error == 0;
for (uint64_t block_base = 0; block_base < end && error == 0;
block_base += blksz) {
dmu_buf_t *db;
error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
@ -108,7 +110,7 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
return (error);
uint64_t *block_start = db->db_data;
uint64_t block_length = MIN(sm_len - block_base, blksz);
uint64_t block_length = MIN(end - block_base, blksz);
uint64_t *block_end = block_start +
(block_length / sizeof (uint64_t));
@ -191,7 +193,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
* dmu_buf_hold().
*/
uint64_t last_word_offset =
sm->sm_phys->smp_objsize - sizeof (uint64_t);
sm->sm_phys->smp_length - sizeof (uint64_t);
error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
FTAG, &db, DMU_READ_NO_PREFETCH);
if (error != 0)
@ -204,7 +206,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
uint64_t *words = db->db_data;
*nwords =
(sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
(sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
@ -303,8 +305,7 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
uint64_t e = buf[i];
if (sm_entry_is_debug(e)) {
sm->sm_phys->smp_objsize -= sizeof (uint64_t);
space_map_update(sm);
sm->sm_phys->smp_length -= sizeof (uint64_t);
continue;
}
@ -359,15 +360,13 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
sm->sm_phys->smp_alloc -= entry_run;
else
sm->sm_phys->smp_alloc += entry_run;
sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
space_map_update(sm);
sm->sm_phys->smp_length -= words * sizeof (uint64_t);
}
}
if (space_map_length(sm) == 0) {
ASSERT0(error);
ASSERT0(sm->sm_phys->smp_objsize);
ASSERT0(sm->sm_alloc);
ASSERT0(space_map_allocated(sm));
}
zio_buf_free(buf, bufsz);
@ -395,6 +394,33 @@ space_map_load_callback(space_map_entry_t *sme, void *arg)
return (0);
}
/*
* Load the space map into the range tree, like space_map_load(), but only
* read the first 'length' bytes of the space map.
*/
int
space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
uint64_t length)
{
space_map_load_arg_t smla;
VERIFY0(range_tree_space(rt));
if (maptype == SM_FREE)
range_tree_add(rt, sm->sm_start, sm->sm_size);
smla.smla_rt = rt;
smla.smla_sm = sm;
smla.smla_type = maptype;
int err = space_map_iterate(sm, length,
space_map_load_callback, &smla);
if (err != 0)
range_tree_vacate(rt, NULL, NULL);
return (err);
}
/*
* Load the space map disk into the specified range tree. Segments of maptype
* are added to the range tree, other segment types are removed.
@ -402,30 +428,7 @@ space_map_load_callback(space_map_entry_t *sme, void *arg)
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
uint64_t space;
int err;
space_map_load_arg_t smla;
VERIFY0(range_tree_space(rt));
space = space_map_allocated(sm);
if (maptype == SM_FREE) {
range_tree_add(rt, sm->sm_start, sm->sm_size);
space = sm->sm_size - space;
}
smla.smla_rt = rt;
smla.smla_sm = sm;
smla.smla_type = maptype;
err = space_map_iterate(sm, space_map_load_callback, &smla);
if (err == 0) {
VERIFY3U(range_tree_space(rt), ==, space);
} else {
range_tree_vacate(rt, NULL, NULL);
}
return (err);
return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
}
void
@ -511,10 +514,10 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
sizeof (dentry), &dentry, tx);
sm->sm_phys->smp_objsize += sizeof (dentry);
sm->sm_phys->smp_length += sizeof (dentry);
}
/*
@ -546,7 +549,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
uint64_t *block_base = db->db_data;
uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
uint64_t *block_cursor = block_base +
(sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
(sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
ASSERT3P(block_cursor, <=, block_end);
@ -569,7 +572,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
if (block_cursor == block_end) {
dmu_buf_rele(db, tag);
uint64_t next_word_offset = sm->sm_phys->smp_objsize;
uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os,
space_map_object(sm), next_word_offset,
tag, &db, DMU_READ_PREFETCH));
@ -599,7 +602,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
SM_DEBUG_SYNCPASS_ENCODE(0) |
SM_DEBUG_TXG_ENCODE(0);
block_cursor++;
sm->sm_phys->smp_objsize += sizeof (uint64_t);
sm->sm_phys->smp_length += sizeof (uint64_t);
ASSERT3P(block_cursor, ==, block_end);
continue;
}
@ -630,7 +633,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
words);
break;
}
sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
sm->sm_phys->smp_length += words * sizeof (uint64_t);
start += run_len;
size -= run_len;
@ -657,7 +660,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* We do this right after we write the intro debug entry
* because the estimate does not take it into account.
*/
uint64_t initial_objsize = sm->sm_phys->smp_objsize;
uint64_t initial_objsize = sm->sm_phys->smp_length;
uint64_t estimated_growth =
space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
@ -668,7 +671,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* and use that to get a hold of the last block, so we can
* start appending to it.
*/
uint64_t next_word_offset = sm->sm_phys->smp_objsize;
uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
ASSERT3U(db->db_size, ==, sm->sm_blksz);
@ -716,7 +719,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* Therefore we expect the actual objsize to be equal or less
* than whatever we estimated it to be.
*/
ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}
@ -872,23 +875,10 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
}
dmu_buf_will_dirty(sm->sm_dbuf, tx);
sm->sm_phys->smp_objsize = 0;
sm->sm_phys->smp_length = 0;
sm->sm_phys->smp_alloc = 0;
}
/*
* Update the in-core space_map allocation and length values.
*/
void
space_map_update(space_map_t *sm)
{
if (sm == NULL)
return;
sm->sm_alloc = sm->sm_phys->smp_alloc;
sm->sm_length = sm->sm_phys->smp_objsize;
}
uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
@ -1070,32 +1060,14 @@ space_map_object(space_map_t *sm)
return (sm != NULL ? sm->sm_object : 0);
}
/*
* Returns the already synced, on-disk allocated space.
*/
uint64_t
int64_t
space_map_allocated(space_map_t *sm)
{
return (sm != NULL ? sm->sm_alloc : 0);
return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}
/*
* Returns the already synced, on-disk length;
*/
uint64_t
space_map_length(space_map_t *sm)
{
return (sm != NULL ? sm->sm_length : 0);
}
/*
* Returns the allocated space that is currently syncing.
*/
int64_t
space_map_alloc_delta(space_map_t *sm)
{
if (sm == NULL)
return (0);
ASSERT(sm->sm_dbuf != NULL);
return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
return (sm != NULL ? sm->sm_phys->smp_length : 0);
}
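With the in-core sm_alloc/sm_length copies removed above, space_map_allocated() now returns smp_alloc directly as a signed value. That matters for the vdev checkpoint space map, which only ever records frees, so its smp_alloc is non-positive; spa_checkpoint.c and zdb negate it when summing checkpointed space. A small sketch under that assumption (checkpoint_space_sketch() is a hypothetical helper name):

static uint64_t
checkpoint_space_sketch(vdev_t *vd)
{
	/* read straight from the phys structure; no space_map_update() */
	int64_t alloc = space_map_allocated(vd->vdev_checkpoint_sm);

	/* a checkpoint space map only accumulates frees */
	ASSERT3S(alloc, <=, 0);
	return (-alloc);
}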

View File

@ -52,6 +52,8 @@ void metaslab_fini(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);
uint64_t metaslab_allocated_space(metaslab_t *);
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
@ -116,6 +118,7 @@ void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
#ifdef __cplusplus
}

View File

@ -341,8 +341,34 @@ struct metaslab_group {
* being written.
*/
struct metaslab {
/*
* This is the main lock of the metaslab and its purpose is to
* coordinate our allocations and frees [e.g metaslab_block_alloc(),
* metaslab_free_concrete(), ..etc] with our various syncing
* procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
*
* The lock is also used during some miscellaneous operations like
* using the metaslab's histogram for the metaslab group's histogram
* aggregation, or marking the metaslab for initialization.
*/
kmutex_t ms_lock;
/*
* Acquired together with the ms_lock whenever we expect to
* write to metaslab data on-disk (i.e flushing entries to
* the metaslab's space map). It helps coordinate readers of
* the metaslab's space map [see spa_vdev_remove_thread()]
* with writers [see metaslab_sync()].
*
* Note that metaslab_load(), even though a reader, uses
* a completely different mechanism to deal with the reading
* of the metaslab's space map based on ms_synced_length. That
* said, the function still uses the ms_sync_lock after it
* has read the ms_sm [see relevant comment in metaslab_load()
* as to why].
*/
kmutex_t ms_sync_lock;
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
@ -352,6 +378,7 @@ struct metaslab {
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
uint64_t ms_allocated_this_txg;
/*
* The following range trees are accessed only from syncing context.
@ -376,6 +403,55 @@ struct metaslab {
boolean_t ms_loaded;
boolean_t ms_loading;
/*
* The following histograms count entries that are in the
* metaslab's space map (and its histogram) but are not in
* ms_allocatable yet, because they are in ms_freed, ms_freeing,
* or ms_defer[].
*
* When the metaslab is not loaded, its ms_weight needs to
* reflect what is allocatable (i.e. what will be part of
* ms_allocatable if it is loaded). The weight is computed from
* the spacemap histogram, but that includes ranges that are
* not yet allocatable (because they are in ms_freed,
* ms_freeing, or ms_defer[]). Therefore, when calculating the
* weight, we need to remove those ranges.
*
* The ranges in the ms_freed and ms_defer[] range trees are all
* present in the spacemap. However, the spacemap may have
* multiple entries to represent a contiguous range, because it
* is written across multiple sync passes, but the changes of
* all sync passes are consolidated into the range trees.
* Adjacent ranges that are freed in different sync passes of
* one txg will be represented separately (as 2 or more entries)
* in the space map (and its histogram), but these adjacent
* ranges will be consolidated (represented as one entry) in the
* ms_freed/ms_defer[] range trees (and their histograms).
*
* When calculating the weight, we can not simply subtract the
* range trees' histograms from the spacemap's histogram,
* because the range trees' histograms may have entries in
* higher buckets than the spacemap, due to consolidation.
* Instead we must subtract the exact entries that were added to
* the spacemap's histogram. ms_synchist and ms_deferhist[]
* represent these exact entries, so we can subtract them from
* the spacemap's histogram when calculating ms_weight.
*
* ms_synchist represents the same ranges as ms_freeing +
* ms_freed, but without consolidation across sync passes.
*
* ms_deferhist[i] represents the same ranges as ms_defer[i],
* but without consolidation across sync passes.
*/
uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
/*
* Tracks the exact amount of allocated space of this metaslab
* (and specifically the metaslab's space map) up to the most
* recently completed sync pass [see usage in metaslab_sync()].
*/
uint64_t ms_allocated_space;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_activation_weight; /* activation weight */
@ -412,6 +488,9 @@ struct metaslab {
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
/* updated every time we are done syncing the metaslab's space map */
uint64_t ms_synced_length;
boolean_t ms_new;
};
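The ms_lock/ms_sync_lock comments above describe a two-lock scheme: code that writes metaslab data on-disk takes ms_sync_lock and then ms_lock, while readers of the already-synced space map contents take only ms_sync_lock. The standalone sketch below models that discipline with pthread mutexes; every name in it (model_metaslab_t, writer_flush, reader_scan) is hypothetical, and the code is a simplified illustration rather than the ZFS implementation.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for a metaslab with an append-only "space map". */
typedef struct model_metaslab {
	pthread_mutex_t	msl_lock;		/* models ms_lock */
	pthread_mutex_t	msl_sync_lock;		/* models ms_sync_lock */
	uint64_t	msl_entries[64];	/* models on-disk entries */
	uint64_t	msl_synced_length;	/* models ms_synced_length */
} model_metaslab_t;

/* Writer (think metaslab_sync()): append under both locks. */
static void
writer_flush(model_metaslab_t *ms, uint64_t entry)
{
	pthread_mutex_lock(&ms->msl_sync_lock);
	pthread_mutex_lock(&ms->msl_lock);
	ms->msl_entries[ms->msl_synced_length] = entry;
	ms->msl_synced_length++;	/* publish only after the write */
	pthread_mutex_unlock(&ms->msl_lock);
	pthread_mutex_unlock(&ms->msl_sync_lock);
}

/* Reader (think spa_vdev_remove_thread()): scan under ms_sync_lock only. */
static uint64_t
reader_scan(model_metaslab_t *ms)
{
	uint64_t sum = 0;

	pthread_mutex_lock(&ms->msl_sync_lock);
	for (uint64_t i = 0; i < ms->msl_synced_length; i++)
		sum += ms->msl_entries[i];
	pthread_mutex_unlock(&ms->msl_sync_lock);
	return (sum);
}

int
main(void)
{
	model_metaslab_t ms;

	pthread_mutex_init(&ms.msl_lock, NULL);
	pthread_mutex_init(&ms.msl_sync_lock, NULL);
	ms.msl_synced_length = 0;

	writer_flush(&ms, 16);
	writer_flush(&ms, 32);
	printf("sum of synced entries: %llu\n",
	    (unsigned long long)reader_scan(&ms));

	pthread_mutex_destroy(&ms.msl_lock);
	pthread_mutex_destroy(&ms.msl_sync_lock);
	return (0);
}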

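The ms_synchist/ms_deferhist comment explains that, for an unloaded metaslab, the weight must be derived from the space map histogram after subtracting the exact entries that are not yet allocatable. Below is a minimal, self-contained model of that subtraction; the bucket count, the "highest non-empty bucket" weight, and all model_* names are simplifications invented for illustration, not the real metaslab_weight() code.

#include <stdint.h>
#include <stdio.h>

#define	MODEL_HISTOGRAM_SIZE	8	/* stands in for SPACE_MAP_HISTOGRAM_SIZE */
#define	MODEL_DEFER_SIZE	2	/* stands in for TXG_DEFER_SIZE */
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

/*
 * Remove the not-yet-allocatable entries (synchist + deferhist[]) from the
 * space map histogram, then pick the highest non-empty bucket as a crude
 * weight. The real code also factors in sm_shift and segment-based weights.
 */
static uint64_t
model_weight(uint64_t smhist[MODEL_HISTOGRAM_SIZE],
    uint64_t synchist[MODEL_HISTOGRAM_SIZE],
    uint64_t deferhist[MODEL_DEFER_SIZE][MODEL_HISTOGRAM_SIZE])
{
	uint64_t weight = 0;

	for (int i = MODEL_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		uint64_t count = smhist[i];

		count -= MIN(count, synchist[i]);
		for (int d = 0; d < MODEL_DEFER_SIZE; d++)
			count -= MIN(count, deferhist[d][i]);

		if (count != 0 && weight == 0)
			weight = (uint64_t)i + 1;	/* highest usable bucket */
	}
	return (weight);
}

int
main(void)
{
	uint64_t smhist[MODEL_HISTOGRAM_SIZE] = { 0, 4, 2, 1, 0, 0, 0, 0 };
	uint64_t synchist[MODEL_HISTOGRAM_SIZE] = { 0, 0, 0, 1, 0, 0, 0, 0 };
	uint64_t deferhist[MODEL_DEFER_SIZE][MODEL_HISTOGRAM_SIZE] = {
		{ 0, 1, 0, 0, 0, 0, 0, 0 },
		{ 0, 0, 1, 0, 0, 0, 0, 0 },
	};

	printf("model weight bucket: %llu\n",
	    (unsigned long long)model_weight(smhist, synchist, deferhist));
	return (0);
}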
View File

@ -87,12 +87,13 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_verify_not_present(range_tree_t *rt,
uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
uint64_t range_tree_min(range_tree_t *rt);

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@ -55,10 +55,17 @@ extern "C" {
* for backward compatibility.
*/
typedef struct space_map_phys {
uint64_t smp_object; /* on-disk space map object */
uint64_t smp_objsize; /* size of the object */
int64_t smp_alloc; /* space allocated from the map */
uint64_t smp_pad[5]; /* reserved */
/* object number: not needed but kept for backwards compatibility */
uint64_t smp_object;
/* length of the object in bytes */
uint64_t smp_length;
/* space allocated from the map */
int64_t smp_alloc;
/* reserved */
uint64_t smp_pad[5];
/*
* The smp_histogram maintains a histogram of free regions. Each
@ -81,8 +88,6 @@ typedef struct space_map {
uint64_t sm_start; /* start of map */
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
uint64_t sm_length; /* synced length */
int64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */
@ -189,18 +194,20 @@ boolean_t sm_entry_is_double_word(uint64_t e);
typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
uint64_t length);
int space_map_iterate(space_map_t *sm, uint64_t length,
sm_cb_t callback, void *arg);
int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx);
boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
dmu_tx_t *tx);
void space_map_update(space_map_t *sm);
uint64_t space_map_object(space_map_t *sm);
uint64_t space_map_allocated(space_map_t *sm);
int64_t space_map_allocated(space_map_t *sm);
uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
@ -216,8 +223,6 @@ int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
uint64_t start, uint64_t size, uint8_t shift);
void space_map_close(space_map_t *sm);
int64_t space_map_alloc_delta(space_map_t *sm);
#ifdef __cplusplus
}
#endif
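The reworked prototypes above have callers pass an explicit length into space_map_iterate() and space_map_load_length(), so a reader only consumes the prefix of the on-disk object it knows to be fully synced (typically ms_synced_length). The sketch below models that idea with a toy one-word encoding and a hypothetical callback type; the real on-disk format (debug entries, one- and two-word entries) is considerably richer.

#include <stdint.h>
#include <stdio.h>

/* Toy entry: high bit = alloc/free flag, low 63 bits = run length. */
#define	MODEL_SME_ALLOC(run)	((1ULL << 63) | (run))
#define	MODEL_SME_FREE(run)	(run)
#define	MODEL_SME_IS_ALLOC(e)	(((e) >> 63) != 0)
#define	MODEL_SME_RUN(e)	((e) & ~(1ULL << 63))

typedef int (*model_sm_cb_t)(uint64_t entry, void *arg);

/*
 * Walk only the first `length` bytes of the buffer, mirroring how the
 * iteration is now bounded by a caller-supplied length instead of a
 * separately maintained in-core copy of it.
 */
static int
model_sm_iterate(const uint64_t *buf, uint64_t length,
    model_sm_cb_t cb, void *arg)
{
	for (uint64_t off = 0; off < length; off += sizeof (uint64_t)) {
		int err = cb(buf[off / sizeof (uint64_t)], arg);
		if (err != 0)
			return (err);
	}
	return (0);
}

/* Accumulate allocated space: ALLOC entries add, FREE entries subtract. */
static int
model_alloc_cb(uint64_t entry, void *arg)
{
	int64_t *alloc = arg;

	if (MODEL_SME_IS_ALLOC(entry))
		*alloc += (int64_t)MODEL_SME_RUN(entry);
	else
		*alloc -= (int64_t)MODEL_SME_RUN(entry);
	return (0);
}

int
main(void)
{
	uint64_t sm_words[3] = {
		MODEL_SME_ALLOC(128),
		MODEL_SME_FREE(32),
		MODEL_SME_ALLOC(64),	/* appended but not yet synced */
	};
	uint64_t synced_length = 2 * sizeof (uint64_t);
	int64_t alloc = 0;

	(void) model_sm_iterate(sm_words, synced_length, model_alloc_cb, &alloc);
	printf("allocated within synced prefix: %lld\n", (long long)alloc);
	return (0);
}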

View File

@ -268,7 +268,6 @@ struct vdev {
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
boolean_t vdev_ishole; /* is a hole in the namespace */
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
@ -327,16 +326,6 @@ struct vdev {
range_tree_t *vdev_obsolete_segments;
space_map_t *vdev_obsolete_sm;
/*
* The queue depth parameters determine how many async writes are
* still pending (i.e. allocated but not yet issued to disk) per
* top-level (vdev_async_write_queue_depth) and the maximum allowed
* (vdev_max_async_write_queue_depth). These values only apply to
* top-level vdevs.
*/
uint64_t vdev_async_write_queue_depth;
uint64_t vdev_max_async_write_queue_depth;
/*
* Protects the vdev_scan_io_queue field itself as well as the
* structure's contents (when present).

View File

@ -630,7 +630,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
@ -1032,7 +1031,6 @@ vdev_free(vdev_t *vd)
rw_destroy(&vd->vdev_indirect_rwlock);
mutex_destroy(&vd->vdev_obsolete_lock);
mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
@ -1401,12 +1399,12 @@ vdev_metaslab_fini(vdev_t *vd)
}
if (vd->vdev_ms != NULL) {
uint64_t count = vd->vdev_ms_count;
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_passivate(mg);
metaslab_group_passivate(vd->vdev_mg);
uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp != NULL)
metaslab_fini(msp);
}
@ -1414,6 +1412,9 @@ vdev_metaslab_fini(vdev_t *vd)
vd->vdev_ms = NULL;
vd->vdev_ms_count = 0;
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
ASSERT0(vd->vdev_ms_count);
}
@ -2767,13 +2768,6 @@ vdev_dtl_load(vdev_t *vd)
ASSERT(vd->vdev_dtl_sm != NULL);
mutex_enter(&vd->vdev_dtl_lock);
/*
* Now that we've opened the space_map we need to update
* the in-core DTL.
*/
space_map_update(vd->vdev_dtl_sm);
error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);
@ -2933,10 +2927,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
}
dmu_tx_commit(tx);
mutex_enter(&vd->vdev_dtl_lock);
space_map_update(vd->vdev_dtl_sm);
mutex_exit(&vd->vdev_dtl_lock);
}
/*
@ -3079,7 +3069,10 @@ vdev_load(vdev_t *vd)
"asize=%llu", (u_longlong_t)vd->vdev_ashift,
(u_longlong_t)vd->vdev_asize);
return (SET_ERROR(ENXIO));
} else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
}
error = vdev_metaslab_init(vd, 0);
if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
"[error=%d]", error);
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@ -3093,9 +3086,10 @@ vdev_load(vdev_t *vd)
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
if ((error = space_map_open(&vd->vdev_checkpoint_sm,
error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
vd->vdev_ashift))) {
vd->vdev_ashift);
if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",
@ -3103,15 +3097,15 @@ vdev_load(vdev_t *vd)
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
space_map_update(vd->vdev_checkpoint_sm);
/*
* Since the checkpoint_sm contains free entries
* exclusively we can use sm_alloc to indicate the
* culmulative checkpointed space that has been freed.
* exclusively we can use space_map_allocated() to
* indicate the cumulative checkpointed space that
* has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
-vd->vdev_checkpoint_sm->sm_alloc;
-space_map_allocated(vd->vdev_checkpoint_sm);
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
}
@ -3143,7 +3137,6 @@ vdev_load(vdev_t *vd)
(u_longlong_t)obsolete_sm_object, error);
return (error);
}
space_map_update(vd->vdev_obsolete_sm);
}
return (0);
@ -3230,47 +3223,6 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
if (vd->vdev_ms != NULL) {
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp == NULL || msp->ms_sm == NULL)
continue;
mutex_enter(&msp->ms_lock);
/*
* If the metaslab was not loaded when the vdev
* was removed then the histogram accounting may
* not be accurate. Update the histogram information
* here so that we ensure that the metaslab group
* and metaslab class are up-to-date.
*/
metaslab_group_histogram_remove(mg, msp);
VERIFY0(space_map_allocated(msp->ms_sm));
space_map_close(msp->ms_sm);
msp->ms_sm = NULL;
mutex_exit(&msp->ms_lock);
}
if (vd->vdev_checkpoint_sm != NULL) {
ASSERT(spa_has_checkpoint(spa));
space_map_close(vd->vdev_checkpoint_sm);
vd->vdev_checkpoint_sm = NULL;
}
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
vdev_destroy_spacemaps(vd, tx);
@ -3304,17 +3256,14 @@ vdev_sync(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
dmu_tx_t *tx;
ASSERT3U(txg, ==, spa->spa_syncing_txg);
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
dmu_tx_t *tx;
ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vdev_indirect_sync_obsolete(vd, tx);
dmu_tx_commit(tx);
/*
* If the vdev is indirect, it can't have dirty
@ -3323,6 +3272,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
if (vd->vdev_ops == &vdev_indirect_ops) {
ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
dmu_tx_commit(tx);
return;
}
}
@ -3333,12 +3283,10 @@ vdev_sync(vdev_t *vd, uint64_t txg)
!vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
dmu_tx_commit(tx);
}
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
@ -3357,6 +3305,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
vdev_remove_empty_log(vd, txg);
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
dmu_tx_commit(tx);
}
uint64_t
@ -3586,8 +3535,6 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
!=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}
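The checkpoint hunk in vdev_load() above depends on the fact that a checkpoint space map contains only FREE entries, so its running allocated total can only go negative and its negation equals the cumulative checkpointed space. A tiny arithmetic model of that accounting, with invented names:

#include <stdint.h>
#include <stdio.h>

/*
 * Replay a stream of signed deltas the way a space map tracks smp_alloc:
 * ALLOC entries add, FREE entries subtract. For a checkpoint space map
 * every entry is a FREE, so the running total stays <= 0 and the amount
 * of checkpointed (freed) space is simply its negation.
 */
int
main(void)
{
	int64_t checkpoint_deltas[] = { -4096, -8192, -16384 };	/* all frees */
	int64_t alloc = 0;

	for (size_t i = 0;
	    i < sizeof (checkpoint_deltas) / sizeof (checkpoint_deltas[0]); i++)
		alloc += checkpoint_deltas[i];

	uint64_t vs_checkpoint_space = (uint64_t)(-alloc);
	printf("checkpointed space: %llu bytes\n",
	    (unsigned long long)vs_checkpoint_space);
	return (0);
}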

View File

@ -680,7 +680,6 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr)
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
space_map_update(prev_obsolete_sm);
counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
if (prev_obsolete_sm != NULL) {
vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
@ -831,7 +830,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
spa->spa_meta_objset, obsolete_sm_object,
0, vd->vdev_asize, 0));
space_map_update(vd->vdev_obsolete_sm);
}
ASSERT(vd->vdev_obsolete_sm != NULL);
@ -840,7 +838,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
space_map_write(vd->vdev_obsolete_sm,
vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
space_map_update(vd->vdev_obsolete_sm);
range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

View File

@ -557,6 +557,7 @@ vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
losma.losma_counts = counts;
losma.losma_vim = vim;
VERIFY0(space_map_iterate(obsolete_space_sm,
space_map_length(obsolete_space_sm),
load_obsolete_sm_callback, &losma));
}

View File

@ -442,7 +442,7 @@ vdev_initialize_calculate_progress(vdev_t *vd)
mutex_enter(&msp->ms_lock);
uint64_t ms_free = msp->ms_size -
space_map_allocated(msp->ms_sm);
metaslab_allocated_space(msp);
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;

View File

@ -283,15 +283,8 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
if (ms->ms_sm == NULL)
continue;
/*
* Sync tasks happen before metaslab_sync(), therefore
* smp_alloc and sm_alloc must be the same.
*/
ASSERT3U(space_map_allocated(ms->ms_sm), ==,
ms->ms_sm->sm_phys->smp_alloc);
spa->spa_removing_phys.sr_to_copy +=
space_map_allocated(ms->ms_sm);
metaslab_allocated_space(ms);
/*
* Space which we are freeing this txg does not need to
@ -1401,22 +1394,8 @@ spa_vdev_remove_thread(void *arg)
* appropriate action (see free_from_removing_vdev()).
*/
if (msp->ms_sm != NULL) {
space_map_t *sm = NULL;
/*
* We have to open a new space map here, because
* ms_sm's sm_length and sm_alloc may not reflect
* what's in the object contents, if we are in between
* metaslab_sync() and metaslab_sync_done().
*/
VERIFY0(space_map_open(&sm,
spa->spa_dsl_pool->dp_meta_objset,
msp->ms_sm->sm_object, msp->ms_sm->sm_start,
msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
space_map_update(sm);
VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
SM_ALLOC));
space_map_close(sm);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
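In the spa_vdev_remove_thread() hunk above, the allocated segments are now loaded directly from msp->ms_sm (the load stops at the synced length), and the ranges currently being freed are then walked out of the copy. The bitmap model below only illustrates that subtraction step; the bitmap and the model_* helper are invented stand-ins for the range trees, not real ZFS interfaces.

#include <stdint.h>
#include <stdio.h>

#define	MODEL_NSEGS	16	/* tiny address space, one bit per segment */

/* Models range_tree_remove() applied via range_tree_walk(). */
static void
model_remove_ranges(uint32_t *allocd_segs, uint32_t freeing_segs)
{
	*allocd_segs &= ~freeing_segs;
}

int
main(void)
{
	/* What the synced space map says is allocated. */
	uint32_t allocd_segs = 0x00ff;		/* segments 0-7 */
	/* What this txg is already freeing (ms_freeing). */
	uint32_t freeing_segs = 0x0018;		/* segments 3 and 4 */

	model_remove_ranges(&allocd_segs, freeing_segs);

	printf("segments left to copy:");
	for (int i = 0; i < MODEL_NSEGS; i++)
		if (allocd_segs & (1U << i))
			printf(" %d", i);
	printf("\n");
	return (0);
}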
@ -1612,16 +1591,6 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
ASSERT0(range_tree_space(msp->ms_freed));
if (msp->ms_sm != NULL) {
/*
* Assert that the in-core spacemap has the same
* length as the on-disk one, so we can use the
* existing in-core spacemap to load it from disk.
*/
ASSERT3U(msp->ms_sm->sm_alloc, ==,
msp->ms_sm->sm_phys->smp_alloc);
ASSERT3U(msp->ms_sm->sm_length, ==,
msp->ms_sm->sm_phys->smp_objsize);
mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
@ -1714,9 +1683,6 @@ spa_vdev_remove_cancel(spa_t *spa)
return (error);
}
/*
* Called every sync pass of every txg if there's a svr.
*/
void
svr_sync(spa_t *spa, dmu_tx_t *tx)
{
@ -1780,6 +1746,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
ASSERT(vd->vdev_islog);
ASSERT(vd == vd->vdev_top);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
/*
* Stop allocating from this vdev.
@ -1794,15 +1761,14 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
/*
* Evacuate the device. We don't hold the config lock as writer
* since we need to do I/O but we do keep the
* Evacuate the device. We don't hold the config lock as
* writer since we need to do I/O but we do keep the
* spa_namespace_lock held. Once this completes the device
* should no longer have any blocks allocated on it.
*/
if (vd->vdev_islog) {
if (vd->vdev_stat.vs_alloc != 0)
error = spa_reset_logs(spa);
}
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (vd->vdev_stat.vs_alloc != 0)
error = spa_reset_logs(spa);
*txg = spa_vdev_config_enter(spa);
@ -1821,6 +1787,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
vdev_dirty_leaves(vd, VDD_DTL, *txg);
vdev_config_dirty(vd);
vdev_metaslab_fini(vd);
spa_history_log_internal(spa, "vdev remove", NULL,
"%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
(vd->vdev_path != NULL) ? vd->vdev_path : "-");
@ -1850,6 +1818,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
if (list_link_active(&vd->vdev_config_dirty_node))
vdev_config_clean(vd);
ASSERT0(vd->vdev_stat.vs_alloc);
/*
* Clean up the vdev namespace.
*/